Source code for nlpboost.hftransformers_manager
from transformers import (
AutoTokenizer,
EncoderDecoderModel,
Seq2SeqTrainer,
Trainer,
Seq2SeqTrainingArguments,
TrainingArguments,
AutoModelForSequenceClassification,
AutoModelForSeq2SeqLM,
AutoConfig,
AutoModelForTokenClassification,
AutoModelForQuestionAnswering,
DataCollatorForTokenClassification,
DataCollatorForSeq2Seq
)
import torch
from functools import partial
from .dataset_config import DatasetConfig
from .model_config import ModelConfig
from typing import Dict
class MultilabelTrainer(Trainer):
    """Version of the trainer used for multilabel setting.

    The loss is computed with ``torch.nn.BCEWithLogitsLoss`` over the model
    logits instead of relying on the loss returned by the model.
    """

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """Compute loss of the model.

        Parameters
        ----------
        model : transformers.PreTrainedModel
            Model to compute loss.
        inputs : dict
            Model inputs. Must contain a ``"labels"`` entry, which is removed
            from ``inputs`` before the forward pass.
        return_outputs : bool
            Whether or not to return model outputs.
        num_items_in_batch : int, optional
            Accepted only so that Trainer versions which pass this keyword to
            ``compute_loss`` keep working. It is not used: the loss below uses
            the default reduction of ``BCEWithLogitsLoss``.

        Returns
        -------
        loss or (loss, outputs)
            The loss alone, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # The label count is read from the trainer's own model config rather
        # than from the ``model`` argument, exactly as before.
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            outputs.logits.view(-1, num_labels),
            # BCEWithLogitsLoss needs float targets.
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss
# Trainer class to instantiate for each task name.
map_trainer_cls = dict(
    classification=Trainer,
    ner=Trainer,
    qa=Trainer,
    multilabel=MultilabelTrainer,
    seq2seq=Seq2SeqTrainer,
)
# Default Auto* model class to load for each task name.
map_model_cls = {
    "classification": AutoModelForSequenceClassification,
    # "multilabel" is a valid key of map_trainer_cls; without an entry here,
    # looking up the model class for that task raises KeyError.
    # MultilabelTrainer reshapes logits to (-1, num_labels), which matches a
    # sequence-classification head.
    "multilabel": AutoModelForSequenceClassification,
    "ner": AutoModelForTokenClassification,
    "qa": AutoModelForQuestionAnswering,
    "seq2seq": AutoModelForSeq2SeqLM,
}
class HFTransformersManager:
    """
    Utility for loading HF Transformers' objects, using a dataset config and a model config.

    Parameters
    ----------
    model_config: nlpboost.ModelConfig
        Configuration for the model.
    dataset_config: nlpboost.DatasetConfig
        Configuration for the dataset
    use_auth_token: bool
        Value forwarded as ``use_auth_token`` to the ``from_pretrained`` calls
        made by this class.
    """

    def __init__(
        self,
        model_config: ModelConfig = None,
        dataset_config: DatasetConfig = None,
        use_auth_token: bool = True,
    ):
        self.model_config = model_config
        self.dataset_config = dataset_config
        self.use_auth_token = use_auth_token

    def load_config(self, tag2id: Dict, dropout: float):
        """
        Load configuration for the model depending on the type of task we are doing.

        Parameters
        ----------
        tag2id: Dict
            Dictionary mapping labels to indices of those labels in the network output layer.
            Only used for tasks other than "qa", "multiple_choice" and "seq2seq".
        dropout: float
            Dropout value, set on the config field named by
            ``model_config.dropout_field_name``.

        Returns
        -------
        config: transformers.PretrainedConfig
            Configuration for use in the transformers module. It is ``None``
            for seq2seq tasks when ``model_config.encoder_name`` is set.
        """
        task = self.dataset_config.task
        model_name = self.model_config.name
        if task in ["qa", "multiple_choice"]:
            config = AutoConfig.from_pretrained(
                model_name,
                use_auth_token=self.use_auth_token,
                **{self.model_config.dropout_field_name: dropout},
            )
        elif task in ["seq2seq"]:
            # With an explicit encoder no config is loaded here; the
            # encoder-decoder model is built in load_model_init instead.
            config = (
                AutoConfig.from_pretrained(
                    model_name, use_auth_token=self.use_auth_token
                )
                if not self.model_config.encoder_name
                else None
            )
        else:
            if not self.model_config.custom_config_class:
                # An explicitly configured label count wins over len(tag2id).
                num_labels = (
                    self.dataset_config.config_num_labels
                    if self.dataset_config.config_num_labels
                    else len(tag2id)
                )
                config = AutoConfig.from_pretrained(
                    model_name,
                    use_auth_token=self.use_auth_token,
                    num_labels=num_labels,
                    **{self.model_config.dropout_field_name: dropout},
                )
            else:
                config = self.model_config.custom_config_class
            # Store the label mappings on the config; load_trainer later reads
            # config.id2label to build the metrics function.
            config.label2id = tag2id
            config.id2label = {i: tag for tag, i in tag2id.items()}
        if self.model_config.config_problem_type:
            config.problem_type = self.model_config.config_problem_type
        if self.model_config.custom_params_config_model is not None:
            for param, val in self.model_config.custom_params_config_model.items():
                setattr(config, param, val)
        return config

    def load_data_collator(self, tokenizer):
        """
        Load data collator depending on the type of task we are doing.

        Parameters
        ----------
        tokenizer: transformers.PretrainedTokenizer
            Tokenizer to process data.

        Returns
        -------
        data_collator: transformers.DataCollator
            DataCollator for use in the transformers library, or ``None`` for
            tasks other than "ner" and "seq2seq".
        """
        task = self.dataset_config.task
        if task == "ner":
            return DataCollatorForTokenClassification(tokenizer)
        if task == "seq2seq":
            return DataCollatorForSeq2Seq(tokenizer)
        return None

    def load_tokenizer(
        self,
    ):
        """
        Load tokenizer for the given model config and model name.

        The tokenizer is loaded from ``model_config.encoder_name`` when it is
        set, otherwise from ``model_config.name``. Entries of
        ``model_config.additional_params_tokenizer`` are forwarded to
        ``AutoTokenizer.from_pretrained`` and override the defaults used here.

        Returns
        -------
        tokenizer:
            Loaded tokenizer.
        """
        tokenizer_kwargs = {
            "do_lower_case": False,
            "add_prefix_space": True,
            "use_fast": True,
            "use_auth_token": self.use_auth_token,
        }
        # Merging (instead of passing both sets as keywords) lets a user
        # override a default without a "multiple values for keyword" TypeError.
        tokenizer_kwargs.update(self.model_config.additional_params_tokenizer or {})
        return AutoTokenizer.from_pretrained(
            self.model_config.encoder_name or self.model_config.name,
            **tokenizer_kwargs,
        )

    def get_model_cls(self):
        """
        Get the class to use for a model.

        Precedence: ``model_config.custom_model_class``, then
        ``model_config.model_cls_summarization`` (seq2seq tasks only), then the
        default from ``map_model_cls`` for the dataset task.

        Returns
        -------
        model_cls:
            Class for the model.
        """
        if self.model_config.custom_model_class:
            return self.model_config.custom_model_class
        if (
            self.dataset_config.task == "seq2seq"
            and self.model_config.model_cls_summarization
        ):
            return self.model_config.model_cls_summarization
        return map_model_cls[self.dataset_config.task]

    def load_trainer(
        self,
        dataset,
        tokenizer,
        args,
        model_init,
        data_collator,
        compute_metrics_func,
        config,
    ):
        """
        Load an instantiated Trainer object depending on the configuration.

        Parameters
        ----------
        dataset: datasets.DatasetDict
            Dataset with train and validation splits. When
            ``model_config.only_test`` is set, the "test" split is used for
            both training and evaluation instead.
        tokenizer: transformers.PretrainedTokenizer
            Tokenizer from transformers.
        args: transformers.TrainingArguments
            TrainingArguments for the Trainer.
        model_init: Any
            Function that loads the model.
        data_collator: Any
            Data Collator to use inside Trainer.
        compute_metrics_func: Any
            Function to compute metrics. It is called with extra ``tokenizer``
            and ``id2tag`` keyword arguments. May be falsy to disable metrics.
        config: transformers.PretrainedConfig
            Configuration for the model in Huggingface Transformers.

        Returns
        -------
        Trainer: transformers.Trainer
            Trainer object loaded with the given configuration.
        """
        trainer_cls = (
            self.model_config.custom_trainer_cls
            or map_trainer_cls[self.dataset_config.task]
        )
        only_test = self.model_config.only_test
        train_split = "test" if only_test else "train"
        eval_split = "test" if only_test else "validation"

        compute_metrics = None
        if compute_metrics_func:
            compute_metrics = partial(
                compute_metrics_func,
                tokenizer=tokenizer,
                # config may be None (see load_config), so guard the lookup.
                id2tag=config.id2label if config else None,
            )

        return trainer_cls(
            model_init=model_init,
            args=args,
            train_dataset=dataset[train_split],
            eval_dataset=dataset[eval_split],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=self.dataset_config.callbacks,
        )

    def load_train_args(self, output_dir):
        """
        Load training args depending on the task.

        ``metric_for_best_model`` and ``greater_is_better`` are written into
        ``dataset_config.fixed_training_args`` in place (kept for backward
        compatibility). Entries of ``model_config.overwrite_training_args``
        override the dataset-level arguments for this call only.

        Parameters
        ----------
        output_dir: str
            Local directory name to save the model. Also used as the default
            ``run_name``.

        Returns
        -------
        args: transformers.TrainingArguments
            Arguments for training (``Seq2SeqTrainingArguments`` for seq2seq tasks).
        """
        self.dataset_config.fixed_training_args.update(
            {
                "metric_for_best_model": self.dataset_config.metric_optimize,
                "greater_is_better": self.dataset_config.direction_optimize
                == "maximize",
            }
        )
        is_seq2seq = self.dataset_config.task == "seq2seq"

        # Defaults go first so user-provided training args can override them;
        # passing both as keywords would raise "multiple values for keyword
        # argument" whenever a user sets e.g. report_to.
        training_kwargs = {
            "run_name": output_dir,
            "overwrite_output_dir": True,
            "report_to": ["tensorboard"],
        }
        if is_seq2seq:
            training_kwargs["predict_with_generate"] = True
        training_kwargs.update(self.dataset_config.fixed_training_args)
        if self.model_config.overwrite_training_args:
            training_kwargs.update(self.model_config.overwrite_training_args)

        args_cls = Seq2SeqTrainingArguments if is_seq2seq else TrainingArguments
        return args_cls(output_dir=output_dir, **training_kwargs)

    def load_model_init(self, model_cls, config, tokenizer):
        """
        Load the model init function.

        This function is useful for the Transformers integration with Optuna.

        Parameters
        ----------
        model_cls
            Class for the model.
        config: AutoConfig
            Configuration for the model.
        tokenizer: transformers.PretrainedTokenizer
            Tokenizer to preprocess text data.

        Returns
        -------
        model_init
            Zero-argument function for initializing the model. Later passed to the Trainer.
        """
        if self.dataset_config.task == "seq2seq" and self.model_config.encoder_name:

            def model_init():
                """Build an encoder-decoder model and set its generation config."""
                model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                    self.model_config.encoder_name,
                    self.model_config.decoder_name,
                    tie_encoder_decoder=self.model_config.tie_encoder_decoder,
                    use_auth_token=self.use_auth_token,
                )
                # Fall back to token id 0 when the tokenizer has no BOS token.
                bos_token_id = (
                    tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 0
                )
                model.config.decoder_start_token_id = bos_token_id
                model.config.eos_token_id = tokenizer.eos_token_id
                model.config.pad_token_id = tokenizer.pad_token_id
                model.config.vocab_size = model.config.decoder.vocab_size
                model.config.max_length = self.model_config.max_length_summary
                model.config.min_length = self.model_config.min_length_summary
                model.config.no_repeat_ngram_size = (
                    self.model_config.no_repeat_ngram_size
                )
                model.config.early_stopping = (
                    self.model_config.early_stopping_summarization
                )
                model.config.length_penalty = self.model_config.length_penalty
                model.config.num_beams = self.model_config.num_beams
                model.config.decoder.bos_token_id = bos_token_id
                return model

        elif not self.model_config.custom_model_class:

            def model_init():
                """Model init function needed by optuna to initialize the model."""
                if self.dataset_config.is_multilabel:
                    multilabel_overrides = {
                        "problem_type": "multi_label_classification"
                    }
                    # Only override num_labels when it is configured; otherwise
                    # keep the value load_config derived from tag2id instead
                    # of replacing it with None.
                    if self.dataset_config.config_num_labels:
                        multilabel_overrides[
                            "num_labels"
                        ] = self.dataset_config.config_num_labels
                    config.update(multilabel_overrides)
                return model_cls.from_pretrained(
                    self.model_config.name,
                    config=config,
                    use_auth_token=self.use_auth_token,
                )

        elif self.model_config.custom_config_class:

            def model_init():
                """Instantiate the custom model class from the custom config."""
                return self.model_config.custom_model_class(
                    self.model_config.custom_config_class
                )

        else:

            def model_init():
                """Load the custom model class from pretrained weights."""
                # NOTE(review): unlike the other from_pretrained calls in this
                # class, use_auth_token is not forwarded here - confirm
                # whether that is intentional.
                return self.model_config.custom_model_class.from_pretrained(
                    self.model_config.name, config=config
                )

        return model_init