from typing import List
import os
from tqdm import tqdm
from .metrics import (
compute_metrics_ner,
compute_metrics_classification,
compute_metrics_summarization,
compute_metrics_multilabel,
)
from .utils import (
_save_metrics,
joinpaths,
)
from copy import deepcopy
from .ckpt_cleaner import CkptCleaner
from optuna.samplers import TPESampler
from apscheduler.schedulers.background import BackgroundScheduler
from .hftransformers_manager import HFTransformersManager
from .hfdatasets_manager import HFDatasetsManager
from .results_getter import ResultsGetter
# Default metric function for each supported task name. It is looked up by
# `dataset_config.task` when the dataset config supplies no custom evaluation
# function. The "qa" entry is intentionally None: no default function is
# registered for that task in this table.
metric_func_map = {
    "ner": compute_metrics_ner,
    "classification": compute_metrics_classification,
    "qa": None,
    "seq2seq": compute_metrics_summarization,
    # Selected whenever `dataset_config.is_multilabel` is set, regardless of task.
    "multilabel": compute_metrics_multilabel,
}
class AutoTrainer:
    """
    Main class of nlpboost. Fine-tune and evaluate several models on several datasets.

    Useful for performing benchmarking of different models on the same datasets. The behavior
    of `AutoTrainer` is mainly configured through `model_configs` and `dataset_configs`, which
    define the datasets and the models to be used.

    Parameters
    ----------
    model_configs: List[nlpboost.ModelConfig]
        Configurations for the models, instances of ModelConfig, each describing their
        names in the hub or local directory, the name to save the model, the dropout
        values to use, and a long etc.
    dataset_configs: List[nlpboost.DatasetConfig]
        Configurations for the datasets, instances of DatasetConfig, each describing
        how each dataset should be processed.
    metrics_dir: str
        Directory to save the metrics for the experiments, as returned by
        `nlpboost.ResultsGetter`. It is created on init if it does not exist.
    hp_search_mode: str
        Mode for hyperparameter search; possibilities are `optuna` or `fixed`. If `fixed`,
        no hyperparameter tuning is carried out.
    clean: bool
        Whether to clean checkpoints every 10 minutes to avoid using too much disk, by
        using nlpboost.CkptCleaner. Best model checkpoint is also saved when unuseful
        checkpoints are deleted. Only relevant for the `optuna` mode.
    metrics_cleaner: str
        Path to the folder where the metrics of the checkpoint cleaner should be stored.
        These metrics are used to decide which checkpoints should be removed. Note: if the
        experiment fails for some reason, and you re-launch it, please remove this folder
        before doing so. Otherwise there will probably be an error, as the checkpoint cleaner
        will use metrics from past experiments, not the running one, so there will be
        incorrect checkpoint removals.
    use_auth_token: bool
        Whether to use auth token to load datasets and models.
    skip_mixes: List[nlpboost.SkipMix]
        List of SkipMix instances with combinations of datasets and models that must be
        skipped. `None` (the default) means nothing is skipped.
    """

    def __init__(
        self,
        model_configs: List,
        dataset_configs: List,
        metrics_dir: str = "tmp_experiments_metrics",
        hp_search_mode: str = "optuna",
        clean: bool = True,
        metrics_cleaner: str = "tmp_metrics_cleaner",
        use_auth_token: bool = False,
        skip_mixes: List = None,
    ):
        self.model_configs = model_configs
        self.dataset_configs = dataset_configs
        self.metrics_dir = metrics_dir
        self.hp_search_mode = hp_search_mode
        self.metrics_cleaner = metrics_cleaner
        self.clean = clean
        self.use_auth_token = use_auth_token
        self.skip_mixes = skip_mixes
        # Metrics are written here after every run, so make sure the folder exists.
        os.makedirs(self.metrics_dir, exist_ok=True)

    def __call__(self):
        """
        Run the experiments with the strategy selected by `hp_search_mode`.

        Use `train_with_fixed_params` or `optuna_hp_search` to carry out hyperparameter
        search defined in init. Check the documentation of those methods for more
        information.

        Returns
        -------
        all_results: Dict
            Dictionary with results from the experiments.

        Raises
        ------
        ValueError
            If `hp_search_mode` is neither `optuna` nor `fixed`.
        """
        if self.hp_search_mode == "optuna":
            return self.optuna_hp_search()
        if self.hp_search_mode == "fixed":
            return self.train_with_fixed_params()
        # Fail loudly instead of silently returning None on a misspelled mode.
        raise ValueError(
            f"Unknown hp_search_mode {self.hp_search_mode!r}; expected 'optuna' or 'fixed'."
        )

    def train_with_fixed_params(self) -> dict:
        """
        Train without hyperparameter search, with a fixed set of params.

        The default parameters are defined in the fixed_train_args of DatasetConfig.
        However, we can use ModelConfig.overwrite_training_args to change this,
        by passing a dictionary with the new parameters that we want to use for a model.
        Only the first value of `model_config.dropout_vals` is used in this mode.

        Returns
        -------
        all_results: Dict
            Dictionary with results from the experiments, keyed by model save name
            (with "/" replaced by "-").
        """
        all_results = {}
        for dataset_config in tqdm(
            self.dataset_configs, desc="Iterating over datasets..."
        ):
            for model_config in tqdm(
                self.model_configs,
                desc=f"Trying models on dataset {dataset_config.dataset_name}",
            ):
                if self._should_skip(model_config, dataset_config):
                    continue
                (
                    transformers_manager,
                    datasets_manager,
                    model_config,
                    dataset_config,
                ) = self._prepare_managers(model_config, dataset_config)
                output_dir = joinpaths(
                    model_config.save_dir,
                    f"fixedparams_{model_config.save_name}-{dataset_config.alias}",
                )
                dataset, compute_metrics_func = self._setup_trainer(
                    transformers_manager,
                    datasets_manager,
                    dataset_config,
                    model_config.dropout_vals[0],
                    output_dir,
                )
                test_results = self.train_one_model_fixed_params(
                    model_config, dataset_config, compute_metrics_func, dataset["test"]
                )
                # NOTE(review): the key only contains the model name, so results of the
                # same model on a later dataset overwrite earlier ones in this dict.
                # Kept as is to preserve the returned structure; the per-run metrics
                # are still saved individually to `metrics_dir`.
                all_results[model_config.save_name.replace("/", "-")] = test_results
        return all_results

    def optuna_hp_search(self) -> dict:
        """
        Carry out hyperparameter search with Optuna.

        Use `model_configs` and `dataset_configs` passed in init. Iterate over
        each dataset, and then over each model, with hyperparameter tuning.
        Metrics over the test dataset are gathered and then saved
        in the `metrics_dir` specified in init for each of those models, for later
        comparison. One search is launched per value in `model_config.dropout_vals`.

        Returns
        -------
        all_results: Dict
            Dictionary with results from the experiments.
        """
        all_results = {}
        for dataset_config in tqdm(
            self.dataset_configs, desc="Iterating over datasets..."
        ):
            for model_config in tqdm(
                self.model_configs,
                desc=f"Trying models on dataset {dataset_config.dataset_name}",
            ):
                if self._should_skip(model_config, dataset_config):
                    continue
                (
                    transformers_manager,
                    datasets_manager,
                    model_config,
                    dataset_config,
                ) = self._prepare_managers(model_config, dataset_config)
                for p_d in model_config.dropout_vals:
                    output_dir = joinpaths(
                        model_config.save_dir,
                        f"best_optuna_{model_config.save_name}-{dataset_config.alias}-dropout_{p_d}",
                    )
                    dataset, compute_metrics_func = self._setup_trainer(
                        transformers_manager,
                        datasets_manager,
                        dataset_config,
                        p_d,
                        output_dir,
                    )

                    # Bind the metric name now so the closure does not depend on the
                    # loop variable being rebound later.
                    def compute_objective(
                        metrics, metric_name=dataset_config.metric_optimize
                    ):
                        return metrics[metric_name]

                    test_results = self.train_one_model_optuna(
                        model_config,
                        dataset_config,
                        compute_objective,
                        compute_metrics_func,
                        output_dir,
                        dataset["test"],
                    )
                    # NOTE(review): the key only contains the model name, so results
                    # for other dropout values or datasets overwrite each other here.
                    # Kept as is to preserve the returned structure.
                    all_results[model_config.save_name.replace("/", "-")] = test_results
        return all_results

    def train_one_model_fixed_params(
        self, model_config, dataset_config, compute_metrics_func, test_dataset
    ):
        """
        Train one model with fixed params in one dataset, without tuning parameters.

        Parameters
        ----------
        model_config: nlpboost.ModelConfig
            Configuration for the model.
        dataset_config: nlpboost.DatasetConfig,
            Configuration for the dataset.
        compute_metrics_func: Any
            Function to compute metrics.
        test_dataset: datasets.Dataset
            Test dataset to get metrics on.

        Returns
        -------
        test_results: Dict
            Dictionary with results over the test set after training with fixed params.
        """
        if not model_config.only_test:
            self.trainer.train()
        test_results = self._get_test_results(
            dataset_config, compute_metrics_func, model_config, test_dataset
        )
        _save_metrics(
            test_results,
            model_config.save_name,
            dataset_config.alias,
            self.metrics_dir,
        )
        if model_config.push_to_hub and model_config.hf_hub_username is not None:
            self.trainer.push_to_hub(
                f"{model_config.hf_hub_username}/{model_config.save_name}",
                private=True,
            )
        # Added after saving, so the stored metrics do not contain these two keys.
        test_results["model_name"] = model_config.save_name
        test_results["dataset_name"] = dataset_config.alias
        return test_results

    def train_one_model_optuna(
        self,
        model_config,
        dataset_config,
        compute_objective,
        compute_metrics_func,
        output_dir,
        test_dataset,
    ):
        """
        Train one model in one dataset, with hyperparameter tuning, using Optuna.

        If `clean` was enabled in init, load a checkpoint cleaner in the background to
        clean bad performing checkpoints every 10 minutes, also saving the best
        performing checkpoint. Then, carry out hyperparameter search and, if configured
        (see `DatasetConfig`), retrain at end with the best hyperparameters again. After
        that, results on the test set are obtained. For that, `ResultsGetter` is used
        for dataset processing, prediction and metrics gathering. If desired, the user
        may change the behavior of this part by creating a custom `ResultsGetter`
        overriding the desired methods, and passing it as a `custom_results_getter`.
        Metrics are saved in json or txt format, and, if configured, the model
        is pushed to the hub.

        Parameters
        ----------
        model_config: nlpboost.ModelConfig
            Configuration for the model.
        dataset_config: nlpboost.DatasetConfig,
            Configuration for the dataset.
        compute_objective: Any
            Function to return the computed metric objective.
        compute_metrics_func: Any
            Function to compute metrics.
        output_dir: str
            Directory where the model is saved.
        test_dataset: datasets.Dataset
            Test dataset to get metrics on.

        Returns
        -------
        test_results: Dict
            Dictionary with the results in the test set. If `model_config.do_nothing`
            is set, a placeholder dictionary with the key "Do nothing" is returned.
        """
        if model_config.do_nothing:
            return {
                "Do nothing": True,
                "model_name": model_config.save_name,
                "dataset_name": dataset_config.alias,
            }

        scheduler = None
        cleaner_callable = None
        # The cleaner only makes sense while checkpoints are being produced, and only
        # when the user asked for it through the `clean` flag.
        if not model_config.only_test and self.clean:
            cleaner_callable = self._create_clean_job(
                output_dir,
                model_config.save_dir,
                mode="max"
                if dataset_config.direction_optimize == "maximize"
                else "min",
                metrics_save_dir=self.metrics_cleaner,
                modelname=f"{model_config.save_name}-{dataset_config.alias}",
            )
            scheduler = BackgroundScheduler()
            # 600 seconds == the 10 minute cleaning period described in the docs.
            scheduler.add_job(cleaner_callable, "interval", seconds=600)
            scheduler.start()

        try:
            if not model_config.only_test:
                if not model_config.resume_from_checkpoint:
                    best_run = self.trainer.hyperparameter_search(
                        direction=dataset_config.direction_optimize,
                        hp_space=model_config.hp_space,
                        n_trials=model_config.n_trials,
                        compute_objective=compute_objective,
                        sampler=TPESampler(
                            seed=dataset_config.seed,
                            n_startup_trials=model_config.random_init_trials,
                        ),
                    )
                    if dataset_config.retrain_at_end:
                        # Copy the winning hyperparameters onto the training
                        # arguments and train once more with them.
                        for n, v in best_run.hyperparameters.items():
                            setattr(self.trainer.args, n, v)
                        self.trainer.train()
                else:
                    # Resume training from the checkpoint located at the model name.
                    self.trainer.train(model_config.name)
            test_results = self._get_test_results(
                dataset_config, compute_metrics_func, model_config, test_dataset
            )
            _save_metrics(
                test_results,
                model_config.save_name,
                dataset_config.alias,
                self.metrics_dir,
            )
            if model_config.push_to_hub and model_config.hf_hub_username is not None:
                self.trainer.push_to_hub(
                    f"{model_config.hf_hub_username}/{model_config.save_name}",
                    private=True,
                )
        finally:
            # Always stop the background job, even when training or evaluation fails;
            # otherwise it would keep running and cleaning in this process.
            if scheduler is not None:
                scheduler.shutdown()

        if cleaner_callable is not None:
            # Final pass once everything succeeded, this time including the last
            # checkpoint.
            cleaner_callable(skip_last=False)
        test_results["model_name"] = model_config.save_name
        test_results["dataset_name"] = dataset_config.alias
        return test_results

    def _should_skip(self, model_config, dataset_config) -> bool:
        """Return True if this (dataset, model) pair is listed in `skip_mixes`."""
        if self.skip_mixes is None:
            return False
        return any(
            skip_mix.dataset_name == dataset_config.alias
            and skip_mix.model_name == model_config.save_name
            for skip_mix in self.skip_mixes
        )

    def _prepare_managers(self, model_config, dataset_config):
        """
        Create the managers and tokenizer for one (model, dataset) pair.

        Stores the loaded tokenizer in `self.tokenizer`, adapts the configs for
        seq2seq tasks and makes sure there is at least one dropout value to use.

        Returns
        -------
        Tuple with the transformers manager, the datasets manager, the model config
        and the dataset config, in that order.
        """
        transformers_manager = HFTransformersManager(model_config, dataset_config)
        datasets_manager = HFDatasetsManager(dataset_config, model_config)
        self.tokenizer = transformers_manager.load_tokenizer()
        model_config, dataset_config = self._adapt_objects_summarization(
            model_config, dataset_config
        )
        # An empty (or missing) list of dropout values means "use no dropout".
        if not model_config.dropout_vals:
            model_config.dropout_vals = [0.0]
        return transformers_manager, datasets_manager, model_config, dataset_config

    def _setup_trainer(
        self, transformers_manager, datasets_manager, dataset_config, dropout, output_dir
    ):
        """
        Load dataset, config and training args, and build `self.trainer` for one run.

        Parameters
        ----------
        transformers_manager: HFTransformersManager
            Manager used to load config, collator, training args, model and trainer.
        datasets_manager: HFDatasetsManager
            Manager used to load the dataset and the tag2id mapping.
        dataset_config: nlpboost.DatasetConfig
            Configuration for the dataset.
        dropout: float
            Dropout value passed to `load_config`.
        output_dir: str
            Directory passed to `load_train_args`.

        Returns
        -------
        dataset: Any
            Dataset returned by the datasets manager; indexed with "test" by callers.
        compute_metrics_func: Any
            Function to compute metrics, as selected by `_get_compute_metrics`.
        """
        # The datasets manager receives a copy, so it cannot alter the tokenizer that
        # is later handed to the collator, the model init and the trainer.
        dataset, tag2id = datasets_manager.get_dataset_and_tag2id(
            deepcopy(self.tokenizer)
        )
        data_collator = transformers_manager.load_data_collator(self.tokenizer)
        config = transformers_manager.load_config(tag2id, dropout)
        args = transformers_manager.load_train_args(output_dir)
        model_cls = transformers_manager.get_model_cls()
        model_init = transformers_manager.load_model_init(
            model_cls, config, self.tokenizer
        )
        compute_metrics_func = self._get_compute_metrics(dataset_config)
        self.trainer = transformers_manager.load_trainer(
            dataset,
            self.tokenizer,
            args,
            model_init,
            data_collator,
            compute_metrics_func,
            config,
        )
        return dataset, compute_metrics_func

    def _get_test_results(
        self, dataset_config, compute_metrics_func, model_config, test_dataset
    ):
        """
        Get results for the test set. Metrics vary depending on the task.

        Use `ResultsGetter` for dataset processing, obtaining the predictions
        and getting the metrics. If desired, the user may change the behavior
        of this part by creating a custom `ResultsGetter` overriding the desired
        methods, and setting it as `model_config.custom_results_getter`.

        Parameters
        ----------
        dataset_config: nlpboost.DatasetConfig,
            Configuration for the dataset.
        compute_metrics_func: Any
            Function to compute metrics.
        model_config: nlpboost.ModelConfig
            Configuration for the model.
        test_dataset: datasets.Dataset
            Test dataset to get metrics on.

        Returns
        -------
        test_results: Dict
            Dictionary with the results in the test set.
        """
        getter_cls = model_config.custom_results_getter
        if getter_cls is None:
            getter_cls = ResultsGetter
        results_getter = getter_cls(dataset_config, model_config, compute_metrics_func)
        return results_getter(self.trainer, test_dataset)

    def _create_clean_job(
        self,
        output_dir,
        dataset_folder,
        mode,
        metrics_save_dir,
        modelname,
        try_mode=False,
    ):
        """
        Create a job to schedule cleaning process with CkptCleaner.

        Initialize a checkpoint cleaner with class CkptCleaner,
        with parameters passed in this function call. This callable class
        is used as a job to be scheduled, so that checkpoints are cleaned
        every 10 minutes.

        Parameters
        ----------
        output_dir: str
            Directory where models are being saved.
        dataset_folder: str
            Folder handed to the cleaner as `current_dataset_folder`.
        mode: str
            max or min are allowed.
        metrics_save_dir: str
            Directory to save metrics.
        modelname: str
            Name of the current model.
        try_mode: bool
            Default is False. This is to test the checkpoint cleaner without removing
            checkpoints.

        Returns
        -------
        ckpt_cleaner: CkptCleaner
            Instance of CkptCleaner to clean the checkpoints for the current model.
        """
        return CkptCleaner(
            current_folder_clean=output_dir,
            current_dataset_folder=dataset_folder,
            metrics_save_dir=metrics_save_dir,
            modelname=modelname,
            mode=mode,
            try_mode=try_mode,
        )

    def _adapt_tokenizer_summarization(self):
        """Add bos and eos tokens to the tokenizer for summarization, in EncoderDecoder models."""
        # Reuse the cls/sep tokens as sequence start/end markers.
        self.tokenizer.bos_token = self.tokenizer.cls_token
        self.tokenizer.eos_token = self.tokenizer.sep_token

    def _get_compute_metrics(self, dataset_config):
        """
        Get the function to compute metrics with.

        Parameters
        ----------
        dataset_config: nlpboost.DatasetConfig
            Configuration for the dataset.

        Returns
        -------
        compute_metrics_func: Any
            Function to compute metrics.
        """
        if dataset_config.custom_eval_func:
            compute_metrics_func = dataset_config.custom_eval_func
        else:
            compute_metrics_func = metric_func_map[dataset_config.task]
        # NOTE(review): multilabel datasets always get the multilabel metrics, even
        # when a custom evaluation function was supplied - confirm this is intended.
        if dataset_config.is_multilabel:
            compute_metrics_func = metric_func_map["multilabel"]
        return compute_metrics_func

    def _adapt_objects_summarization(self, model_config, dataset_config):
        """
        Adapt dataset config and model config, along with the tokenizer, for seq2seq tasks.

        For any other task the configs are returned untouched.

        Parameters
        ----------
        model_config: nlpboost.ModelConfig
            Configuration for the model.
        dataset_config: nlpboost.DatasetConfig
            Configuration for the dataset.

        Returns
        -------
        model_config: nlpboost.ModelConfig
            Adjusted configuration for the model.
        dataset_config: nlpboost.DatasetConfig
            Adjusted configuration for the dataset.
        """
        if dataset_config.task == "seq2seq":
            dataset_config.max_length_summary = model_config.max_length_summary
            # Without a dedicated summarization model class, the tokenizer needs
            # bos/eos tokens to be set explicitly.
            if not model_config.model_cls_summarization:
                self._adapt_tokenizer_summarization()
        return model_config, dataset_config