# Source code for nlpboost.dataset_config

from dataclasses import asdict, dataclass, field, fields
from typing import Any, Dict, List


@dataclass
class DatasetConfig:
    """
    Configure a dataset for use within the AutoTrainer class.

    This determines how to load the dataset,
    whether local files are needed, whether additional splits are needed (for example when the original
    dataset only has train-test and we want also validation), and so on.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset.
    alias: str
        Alias for the dataset, for saving it.
    task: str
        The task of the dataset. Currently, only classification, ner and qa (question answering) are available.
    fixed_training_args: Dict
        The training arguments (to use in transformers.TrainingArguments) for every model on this dataset, in dictionary format.
    is_multilabel: bool
        Whether it is multilabel classification. Default is False.
    multilabel_label_names: List
        Names of the labels for multilabel training. Default is an empty list.
    hf_load_kwargs: Dict
        Arguments for loading the dataset from the huggingface datasets' hub. Example: {'path': 'wikiann', 'name': 'es'}.
        If None, it is assumed that all necessary files exist locally and are passed in the files field. Default is None.
    type_load: str
        The type of load to perform in load_dataset; for example, if your data is in csv format (d = load_dataset('csv', ...)), this should be csv. Default is json.
    files: Dict
        Files to load the dataset from, in Huggingface's datasets format. Possible keys are train, validation and test. Default is None.
    data_field: str
        Field to load data from in the case of jsons loading in datasets. Default is data.
    partial_split: bool
        Whether a partial split is needed, that is, if you only have train and test sets, this should be True so that a new validation set is created. Default is False.
    split: bool
        This should be true when you only have one split, that is, a big train set; this creates new validation and test sets. Default is False.
    label_col: str
        Name of the label column. Default is label_list.
    val_size: float
        In case no validation split is provided, the proportion of the training data to leave for validation. Default is 0.15.
    test_size: float
        In case no test split is provided, the proportion of the total data to leave for testing. Default is 0.15.
    pre_func
        Function to perform previous transformations. For example, if your dataset lacks a field (like xquad with title field for example), you can fix it in a function provided here. Default is None.
    remove_fields_pre_func: bool
        Whether to remove fields after pre_func is applied. Default is False.
    squad_v2: bool
        Only useful for question answering. Whether it is squad v2 format or not. Default is false.
    text_field: str
        The name of the field containing the text. Useful only in case of unique-text-field datasets, like most datasets are. In case of 2-sentences datasets like xnli or paws-x this is not useful. Default is text.
    is_2sents: bool
        Whether it is a 2 sentence dataset. Useful for processing datasets like xnli or paws-x. Default is False.
    sentence1_field: str
        In case this is a 2 sents dataset, the name of the first sentence field. Default is None.
    sentence2_field: str
        In case this is a 2 sents dataset, the name of the second sentence field. Default is None.
    summary_field: str
        The name of the field with summaries (we assume the long texts are in the text_field field). Only useful for summarization tasks. Default is summary.
    callbacks: List
        Callbacks to use inside transformers. Default is an empty list.
    metric_optimize: str
        Name of the metric you want to optimize in the hyperparameter search. Default is eval_loss.
    direction_optimize : str
        Direction of the optimization problem. Whether you want to maximize or minimize metric_optimize. Default is minimize.
    custom_eval_func: Any
        In case we want a special evaluation function, we can provide it here. It must receive EvalPredictions by trainer, like any compute_metrics function in transformers. Default is None.
    seed : int
        Seed for optuna sampler. Default is 420.
    max_length_summary: int
        Max length of the summaries, for tokenization purposes. It will be changed depending on the ModelConfig. Default is 120.
    num_proc : int
        Number of processes to preprocess data. Default is 4.
    loaded_dataset: Any
        In case you want to do weird things like concatenating datasets or things like that, you can do that here, by passing a (non-tokenized) dataset in this field. Default is None.
    additional_metrics: List
        List of additional metrics loaded from datasets, to compute over the test part. Default is None.
    retrain_at_end: bool
        Whether to retrain with the best performing model. In most cases this should be True, except when training 1 model with 1 set of hyperparams. Default is True.
    config_num_labels: int
        Number of labels to set for the config, if None it will be computed based on number of labels detected. Default is None.
    smoke_test: bool
        Whether to select only top 10 rows of the dataset for smoke testing purposes. Default is False.
    augment_data: bool
        Whether to augment_data or not. Default is False.
    data_augmentation_steps: List
        List of data augmentation techniques to use from NLPAugPipeline. Default is an empty list.
    id_field_qa: str
        Name of the field with the unique id of the examples in a question answering dataset. Default is id.
    pretokenized_dataset: Any
        Pre-tokenized dataset, to avoid tokenizing inside AutoTrainer, which may cause memory issues with huge datasets. Default is None.

    Examples
    --------
    One can easily create a DatasetConfig for dataset conll2002 just with the following:

    >>> from nlpboost import DatasetConfig

    >>> config={'fixed_training_args': {}, 'dataset_name': 'conll2002', 'alias': 'conll2002', 'task': 'ner', 'hf_load_kwargs': {'path': 'conll2002', 'name': 'es'}, 'label_col':'ner_tags'}

    >>> config = DatasetConfig(**config)
    """

    # --- Required fields (no default) ---
    dataset_name: str = field(metadata={"help": "The name of the dataset"})
    alias: str = field(metadata={"help": "Alias for the dataset, for saving it."})
    task: str = field(
        metadata={
            "help": "The task of the dataset. Currenlty, only classification, ner and qa (question answering) are available."
        }
    )
    fixed_training_args: Dict = field(
        metadata={
            "help": "The training arguments (to use in transformers.TrainingArguments) for every model on this dataset, in dictionary format."
        }
    )

    # --- Optional fields ---
    is_multilabel: bool = field(
        default=False, metadata={"help": "Whether it is multilabel classification"}
    )
    multilabel_label_names: List = field(
        default_factory=list,
        metadata={"help": "Names of the labels for multilabel training."},
    )
    hf_load_kwargs: Dict = field(
        default=None,
        metadata={
            "help": (
                "arguments for loading the dataset from the huggingface datasets' hub. Example: {'path': 'wikiann', 'name': 'es'}."
                "if None, it is assumed that all necessary files exist locally and are passed in the files field."
            )
        },
    )
    type_load: str = field(
        default="json",
        metadata={
            "help": "The type of load to perform in load_dataset; for example, if your data is in csv format (d = load_dataset('csv', ...)), this should be csv."
        },
    )
    files: Dict = field(
        default=None,
        metadata={
            "help": "Files to load the dataset from, in Huggingface's datasets format. Possible keys are train, validation and test"
        },
    )
    data_field: str = field(
        default="data",
        metadata={
            "help": "Field to load data from in the case of jsons loading in datasets. "
        },
    )
    partial_split: bool = field(
        default=False,
        metadata={
            "help": "Wheter a partial split is needed, that is, if you only have train and test sets, this should be True so that a new validation set is created."
        },
    )
    split: bool = field(
        default=False,
        metadata={
            "help": "This should be true when you only have one split, that is, a big train set; this creates new validation and test sets."
        },
    )
    label_col: str = field(
        default="label_list", metadata={"help": "Name of the label column."}
    )
    val_size: float = field(
        default=0.15,
        metadata={
            "help": "In case no validation split is provided, the proportion of the training data to leave for validation."
        },
    )
    test_size: float = field(
        default=0.15,
        metadata={
            "help": "In case no test split is provided, the proportion of the total data to leave for testing."
        },
    )
    pre_func: Any = field(
        default=None,
        metadata={
            "help": "function to perform previous transformations. For example, if your dataset lacks a field (like xquad with title field for example), you can fix it in a function provided here."
        },
    )
    remove_fields_pre_func: bool = field(
        default=False,
        metadata={"help": "Whether to remove fields after pre_func is applied."}
    )
    squad_v2: bool = field(
        default=False,
        metadata={
            "help": "Only useful for question answering. Whether it is squad v2 format or not. Default is false"
        },
    )
    text_field: str = field(
        default="text",
        metadata={
            "help": "The name of the field containing the text. Useful only in case of unique-text-field datasets,like most datasets are. In case of 2-sentences datasets like xnli or paws-x this is not useful."
        },
    )
    is_2sents: bool = field(
        default=False,
        metadata={
            "help": "Whether it is a 2 sentence dataset. Useful for processing datasets like xnli or paws-x."
        },
    )
    sentence1_field: str = field(
        default=None,
        metadata={
            "help": "In case this is a 2 sents dataset, the name of the first sentence field."
        },
    )
    sentence2_field: str = field(
        default=None,
        metadata={
            "help": "In case this is a 2 sents dataset, the name of the second sentence field."
        },
    )
    summary_field: str = field(
        default="summary",
        metadata={
            "help": "The name of the field with summaries (we assume the long texts are in the text_field field). Only useful for summarization tasks."
        },
    )
    callbacks: List = field(
        default_factory=list, metadata={"help": "Callbacks to use inside transformers."}
    )
    metric_optimize: str = field(
        default="eval_loss",
        metadata={
            "help": "Name of the metric you want to optimize in the hyperparameter search."
        },
    )
    direction_optimize: str = field(
        default="minimize",
        metadata={
            "help": "Direction of the optimization problem. Whether you want to maximize or minimize metric_optimize."
        },
    )
    custom_eval_func: Any = field(
        default=None,
        metadata={
            "help": "In case we want a special evaluation function, we can provide it here. It must receive EvalPredictions by trainer, like any compute_metrics function in transformers."
        },
    )
    seed: int = field(default=420, metadata={"help": "Seed for optuna sampler. "})
    max_length_summary: int = field(
        default=120,
        metadata={
            "help": "Max length of the summaries, for tokenization purposes. It will be changed depending on the ModelConfig."
        },
    )
    num_proc: int = field(
        default=4, metadata={"help": "Number of processes to preprocess data."}
    )
    loaded_dataset: Any = field(
        default=None,
        metadata={
            "help": "In case you want to do weird things like concatenating datasets or things like that, you can do that here, by passing a (non-tokenized) dataset in this field."
        },
    )
    additional_metrics: List = field(
        default=None,
        metadata={
            "help": "List of additional metrics loaded from datasets, to compute over the test part."
        },
    )
    retrain_at_end: bool = field(
        default=True,
        metadata={
            "help": "whether to retrain with the best performing model. In most cases this should be True, except when you're only training 1 model with 1 set of hyperparams."
        },
    )
    config_num_labels: int = field(
        default=None,
        metadata={
            "help": "Number of labels to set for the config, if None it will be computed based on number of labels detected."
        },
    )
    smoke_test: bool = field(
        default=False,
        metadata={
            "help": "Whether to select only top 10 rows of the dataset for smoke testing purposes"
        },
    )
    augment_data: bool = field(
        default=False, metadata={"help": "Whether to augment_data or not."}
    )
    data_augmentation_steps: List = field(
        default_factory=list,
        metadata={
            "help": "List of data augmentation techniques to use from NLPAugPipeline."
        },
    )
    id_field_qa: str = field(
        default="id",
        metadata={
            "help": "Name of the field with the unique id of the examples in a question answering dataset."
        },
    )
    pretokenized_dataset: Any = field(
        default=None,
        metadata={
            "help": "Pre-tokenized dataset, to avoid tokenizing inside AutoTrainer, which may cause memory issues with huge datasets."
        },
    )

    def __str__(
        self,
    ):
        """
        Representation of dataset config in str.

        Returns
        -------
        str
            The class name followed by one ``name=value,`` line per field,
            with fields sorted alphabetically by name.
        """
        # Read the attributes directly instead of going through ``asdict``:
        # ``asdict`` deep-copies every value, which is wasteful for the dataset
        # fields and raises for values that cannot be deep-copied.
        field_names = sorted(f.name for f in fields(self))
        attrs_as_str = "".join(
            f"{name}={getattr(self, name)},\n" for name in field_names
        )
        return f"{self.__class__.__name__}(\n{attrs_as_str})"