Source code for nlpboost.hfdatasets_manager

from datasets import load_dataset, Value
import datasets
from .utils import (
    get_tags,
    _tokenize_dataset,
)
from .tokenization_functions import (
    tokenize_ner,
    tokenize_squad,
    tokenize_summarization,
    tokenize_classification,
)
from .augmentation import NLPAugPipeline
from transformers import PreTrainedTokenizer


# Lookup table from task identifier to the tokenization function for that
# task. It is handed to ``_tokenize_dataset`` together with the dataset config.
tok_func_map = dict(
    ner=tokenize_ner,
    qa=tokenize_squad,
    seq2seq=tokenize_summarization,
    classification=tokenize_classification,
)


class HFDatasetsManager:
    """
    Utility for loading HF Datasets' objects, using a DatasetConfig and a ModelConfig.

    Parameters
    ----------
    dataset_config: nlpboost.DatasetConfig
        Configuration for the dataset.
    model_config: nlpboost.ModelConfig
        Configuration for the model.
    """

    # Maximum number of rows kept per split when ``smoke_test`` is enabled.
    _SMOKE_TEST_ROWS = 10

    def __init__(self, dataset_config, model_config):
        self.dataset_config = dataset_config
        self.model_config = model_config

    def get_dataset_and_tag2id(self, tokenizer: PreTrainedTokenizer):
        """
        Get dataset and tag2id depending on dataset and model config.

        Using dataset config (task, etc), a preprocessing is applied to the
        dataset, tokenizing text data, returning a processed dataset ready
        for the configured task. When the dataset config carries a
        pretokenized dataset, it is returned as is with an empty tag2id.

        Parameters
        ----------
        tokenizer: transformers.PretrainedTokenizer
            Tokenizer to process data.

        Returns
        -------
        dataset: datasets.DatasetDict
            Tokenized dataset.
        tag2id: Dict
            Dictionary with tags (labels) and their indexes.
        """
        if self.dataset_config.pretokenized_dataset is None:
            dataset, tag2id = self._generic_load_dataset(tokenizer)
        else:
            dataset = self.dataset_config.pretokenized_dataset
            tag2id = {}
        # Without a dedicated test split, the validation split doubles as test.
        if "test" not in dataset:
            dataset["test"] = dataset["validation"]
        return dataset, tag2id

    def _generic_load_dataset(self, tokenizer: PreTrainedTokenizer):
        """
        Load a generic dataset.

        Load the dataset and process it depending on the dataset
        configuration, and get the tag2id (map of labels to ids of the labels).

        Parameters
        ----------
        tokenizer: transformers.PretrainedTokenizer
            Tokenizer to process data.

        Returns
        -------
        dataset: Union[datasets.Dataset,datasets.DatasetDict]
            Dataset containing data for training, evaluation and testing.
        tag2id: Dict
            Dictionary mapping the label names to their numerical ids.
        """
        config = self.dataset_config
        is_qa = config.task == "qa"

        dataset = self._basic_dataset_loading()
        dataset = self._smoke_test_filter(dataset)
        if config.pre_func is not None:
            dataset = dataset.map(
                config.pre_func,
                remove_columns=(
                    dataset["train"].column_names
                    if config.remove_fields_pre_func
                    else None
                ),
            )
        if is_qa:
            # Keep the test split as it is before label mapping, re-splitting,
            # augmentation and tokenization; it is restored untouched below.
            test_dataset = dataset["test"]
        tags = get_tags(dataset, config)
        # Sorting makes the label -> id assignment deterministic.
        tag2id = {t: i for i, t in enumerate(sorted(tags))}
        dataset = self._general_label_mapper(tag2id, dataset)
        dataset = self._resplit_dataset(dataset)
        dataset = self._augment_dataset(dataset)
        dataset = _tokenize_dataset(
            tokenizer, tok_func_map, dataset, config, self.model_config
        )
        if is_qa:
            dataset["test"] = test_dataset
        dataset = self._parse_types_dataset(dataset)
        return dataset, tag2id

    def _parse_types_dataset(self, dataset):
        """
        Parse the types of the dataset if needed from int to float for regression.

        The cast only happens when the config declares a single label
        (``config_num_labels == 1``) and the first training example's label
        is not already a float.

        Parameters
        ----------
        dataset: datasets.DatasetDict
            Dataset to process.

        Returns
        -------
        dataset: datasets.DatasetDict
            Dataset with correct types.
        """
        label_col = self.dataset_config.label_col
        if self.dataset_config.config_num_labels == 1 and not isinstance(
            dataset["train"][0][label_col], float
        ):
            # Features are taken from the train split and applied to the
            # whole dataset object.
            features = dataset["train"].features.copy()
            features[label_col] = Value("float")
            dataset = dataset.cast(features)
        return dataset

    def _smoke_test_filter(self, dataset):
        """
        Filter dataset if smoke test.

        When ``smoke_test`` is enabled, every split is truncated to at most
        its first ``_SMOKE_TEST_ROWS`` rows.

        Parameters
        ----------
        dataset: datasets.DatasetDict
            Dataset to filter.

        Returns
        -------
        dataset: datasets.DatasetDict
            Dataset filtered if necessary.
        """
        if self.dataset_config.smoke_test:
            for split in dataset:
                # Clamp to the split size so splits with fewer rows are kept
                # whole instead of requesting out-of-range indices.
                n_rows = min(self._SMOKE_TEST_ROWS, len(dataset[split]))
                dataset[split] = dataset[split].select(range(n_rows))
        return dataset

    def _basic_dataset_loading(self):
        """
        Load the raw dataset based on dataset config.

        Resolution order: an already loaded dataset from the config, then
        ``hf_load_kwargs`` forwarded to ``load_dataset``, then local files
        loaded with the ``json`` or ``csv`` loader.

        Returns
        -------
        dataset: datasets.DatasetDict
            Raw dataset.

        Raises
        ------
        ValueError
            If no dataset is preloaded, no ``hf_load_kwargs`` are given and
            ``type_load`` is neither ``"json"`` nor ``"csv"``.
        """
        config = self.dataset_config
        if config.loaded_dataset:
            return config.loaded_dataset
        if config.hf_load_kwargs is not None:
            return load_dataset(**config.hf_load_kwargs)
        if config.type_load == "json":
            return load_dataset(
                config.type_load,
                data_files=config.files,
                # Empty/falsy data_field is normalized to None.
                field=config.data_field or None,
            )
        if config.type_load == "csv":
            return load_dataset(config.type_load, data_files=config.files)
        # Previously this path fell through with ``dataset`` unassigned.
        raise ValueError(
            f"Cannot load dataset: unsupported type_load {config.type_load!r}; "
            "expected 'json' or 'csv', or provide loaded_dataset or hf_load_kwargs."
        )

    def _augment_dataset(self, dataset):
        """
        Augment dataset based on dataset config.

        Only the train split is augmented, and only when ``augment_data``
        is enabled.

        Parameters
        ----------
        dataset: datasets.DatasetDict
            Dataset to augment.

        Returns
        -------
        dataset: datasets.DatasetDict
            Augmented dataset.
        """
        if self.dataset_config.augment_data:
            aug_pipeline = NLPAugPipeline(
                steps=self.dataset_config.data_augmentation_steps,
                text_field=self.dataset_config.text_field,
            )
            dataset["train"] = dataset["train"].map(
                aug_pipeline.augment, batched=True, batch_size=64
            )
        return dataset

    def _resplit_dataset(self, dataset):
        """
        Re-split dataset based on dataset config.

        A partial split is done when only ``partial_split`` is set, a
        complete split when only ``split`` is set. If both or neither are
        set, the dataset is returned unchanged.

        Parameters
        ----------
        dataset: datasets.DatasetDict
            Dataset to re-split.

        Returns
        -------
        dataset: datasets.DatasetDict
            Re-splitted dataset.
        """
        if self.dataset_config.partial_split and not self.dataset_config.split:
            dataset = self._partial_split(dataset)
        elif self.dataset_config.split and not self.dataset_config.partial_split:
            dataset = self._complete_split(dataset)
        return dataset

    def _partial_split(self, dataset):
        """
        Split the train part of the dataset to create a validation split which did not exist.

        The test split is the existing ``test`` split when present,
        otherwise the existing ``validation`` split.

        Parameters
        ----------
        dataset: Union[datasets.Dataset, datasets.DatasetDict]
            Dataset containing data for training and testing.

        Returns
        -------
        dataset: datasets.Dataset or datasets.DatasetDict
            Dataset containing data for training, evaluation and testing.
        """
        dataset_train_val = dataset["train"].train_test_split(
            test_size=self.dataset_config.val_size, seed=self.dataset_config.seed
        )
        dataset = datasets.DatasetDict(
            {
                "train": dataset_train_val["train"],
                "validation": dataset_train_val["test"],
                "test": dataset["test" if "test" in dataset else "validation"],
            }
        )
        return dataset

    def _complete_split(self, dataset):
        """
        Split the train part of the dataset to create a validation split and test split which did not exist.

        The test split is carved out of train first; the validation split is
        then carved out of the remaining training data, so ``val_size``
        applies to that remainder.

        Parameters
        ----------
        dataset: Union[datasets.Dataset, datasets.DatasetDict]
            Dataset containing data for training.

        Returns
        -------
        dataset: datasets.Dataset or datasets.DatasetDict
            Dataset containing data for training, evaluation and testing.
        """
        dataset_train_test = dataset["train"].train_test_split(
            test_size=self.dataset_config.test_size, seed=self.dataset_config.seed
        )
        dataset_train_val = dataset_train_test["train"].train_test_split(
            test_size=self.dataset_config.val_size, seed=self.dataset_config.seed
        )
        dataset = datasets.DatasetDict(
            {
                "train": dataset_train_val["train"],
                "validation": dataset_train_val["test"],
                "test": dataset_train_test["test"],
            }
        )
        return dataset

    def _general_label_mapper(self, tag2id, dataset):
        """
        Translate the labels from label names to label ids, for classification and ner.

        For ``ner`` every label in the example's label list is mapped; for
        non-multilabel ``classification`` the single label is mapped. Any
        other configuration leaves the dataset unchanged.

        Parameters
        ----------
        tag2id: Dict
            Dictionary with the map of tag to id of those tags.
        dataset: datasets.Dataset or datasets.DatasetDict
            Dataset containing data for training, evaluation and testing.

        Returns
        -------
        dataset: datasets.Dataset or datasets.DatasetDict
            Processed dataset, with labels mapped to their ids.
        """

        def label_mapper_ner(example):
            example[self.dataset_config.label_col] = [
                tag2id[label] for label in example[self.dataset_config.label_col]
            ]
            return example

        def label_mapper_class(example):
            example[self.dataset_config.label_col] = tag2id[
                example[self.dataset_config.label_col]
            ]
            return example

        if self.dataset_config.task == "ner":
            dataset = dataset.map(label_mapper_ner)
        elif (
            self.dataset_config.task == "classification"
            and not self.dataset_config.is_multilabel
        ):
            dataset = dataset.map(label_mapper_class)
        return dataset