Source code for nlpboost.metrics

from sklearn.metrics import classification_report
import numpy as np
import nltk
import itertools
from typing import List
import torch
import evaluate

# Download the NLTK "punkt" resource when this module is imported.
# compute_metrics_summarization calls nltk.sent_tokenize; presumably that
# call relies on this resource being available locally (TODO confirm).
nltk.download("punkt")

# Metric objects are loaded once at import time and shared by all calls.
# "rouge" is used by compute_metrics_summarization.
metric_sum = evaluate.load("rouge")
# "seqeval" is used by compute_metrics_ner.
metric_seqeval = evaluate.load("seqeval")


def compute_metrics_classification(
    pred, tokenizer=None, id2tag=None, additional_metrics=None
):
    """
    Compute macro-averaged metrics for a multi-class or binary classifier.

    The predicted class of every example is the index of its largest score
    along axis 1. Those predictions are compared with the reference labels
    through ``sklearn.metrics.classification_report``.

    Parameters
    ----------
    pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its
        ``predictions`` and ``label_ids`` attributes are read.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    metrics: Dict
        The ``"macro avg"`` entry of the classification report. See the
        documentation of sklearn.metrics.classification_report for the exact
        keys it contains.
    """
    scores = pred.predictions
    gold_labels = pred.label_ids
    # Highest-scoring column per row is the predicted class id.
    predicted_labels = np.argmax(scores, axis=1)
    report = classification_report(gold_labels, predicted_labels, output_dict=True)
    return report["macro avg"]
def compute_metrics_multilabel(
    pred, tokenizer=None, id2tag=None, additional_metrics=None
):
    """
    Compute the metrics for a multilabel task.

    The raw predictions are passed through a sigmoid and binarised at each
    threshold from 0.1 up to (but excluding) 0.9 in steps of 0.1. For every
    threshold the macro-averaged classification report is computed, and the
    report with the highest macro F1 is returned.

    Parameters
    ----------
    pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its
        ``predictions`` and ``label_ids`` attributes are read; both are
        expected to be numpy arrays.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    best_metrics: Dict
        ``"macro avg"`` entry of the classification report obtained with the
        best threshold. When several thresholds tie, the lowest one wins. A
        report is always returned, even when every threshold scores an F1 of 0.
    """
    logits, labels = pred.predictions, pred.label_ids
    probabilities = torch.sigmoid(torch.from_numpy(logits)).numpy()
    # The label cast does not depend on the threshold, so do it once.
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is equivalent.
    labels = labels.astype(int)
    thresholds = np.arange(0.1, 0.9, 0.1)
    # Start below any attainable F1 so the first threshold is always recorded,
    # instead of returning an empty dict when all scores are 0.
    best_metrics, best_metric, best_threshold = {}, -1.0, None
    for thres in thresholds:
        # Binarise from the untouched probabilities on every iteration.
        # Overwriting them would make all later thresholds operate on 0/1
        # values and yield the same predictions as the first threshold.
        binary_preds = (probabilities >= thres).astype(int)
        class_report = classification_report(
            labels,
            binary_preds,
            output_dict=True,
        )
        metrics = class_report["macro avg"]
        f1 = metrics["f1-score"]
        if f1 > best_metric:
            best_metrics = metrics
            best_metric = f1
            best_threshold = thres
    print(f"*** The best threshold is {best_threshold} ***")
    return best_metrics
def compute_metrics_ner(p, tokenizer=None, id2tag=None, additional_metrics=None):
    """
    Compute metrics for ner.

    Predicted label ids are obtained with an argmax over axis 2 of the
    predictions. If that fails, the predictions are taken to already be label
    ids and are cast to int. Positions whose reference label is -100 are
    dropped, the remaining ids are mapped to tag names through ``id2tag``,
    and the seqeval metric from HF Evaluate scores the resulting sequences.

    Parameters
    ----------
    p: transformers.EvalPrediction
        Instance of EvalPrediction from transformers. Only its
        ``predictions`` and ``label_ids`` attributes are read.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    Metrics
        Dictionary returned by seqeval, with an extra ``"f1-score"`` key that
        duplicates ``"overall_f1"``.
    """
    predictions = p.predictions
    labels = p.label_ids
    try:
        predictions = np.argmax(predictions, axis=2)
    except Exception:
        print("The output shape is not logits-like, but directly targets.")
        predictions = predictions.astype("int")

    def _kept_tags(from_prediction):
        # Tag names per sequence, skipping positions labelled -100.
        sequences = []
        for predicted_row, gold_row in zip(predictions, labels):
            tags = []
            for predicted_id, gold_id in zip(predicted_row, gold_row):
                if gold_id == -100:
                    continue
                chosen_id = predicted_id if from_prediction else gold_id
                tags.append(str(id2tag[chosen_id]))
            sequences.append(tags)
        return sequences

    true_predictions = _kept_tags(True)
    true_labels = _kept_tags(False)

    metrics = metric_seqeval.compute(
        predictions=true_predictions, references=true_labels
    )
    # Expose the overall F1 under an additional "f1-score" key.
    metrics["f1-score"] = metrics["overall_f1"]
    return metrics
def compute_metrics_summarization(
    eval_pred, tokenizer, id2tag=None, additional_metrics: List = None
):
    """
    Compute metrics for summarization tasks, using the rouge metric.

    Predictions and labels are decoded with ``tokenizer``, split into one
    sentence per line, and scored with rouge (with stemming). Scores are
    multiplied by 100, the mean number of non-pad tokens per prediction is
    added as ``"gen_len"``, and every value is rounded to 4 decimals.

    Parameters
    ----------
    eval_pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its
        ``predictions`` and ``label_ids`` attributes are read. If
        ``predictions`` is a tuple, its first element is used.
    tokenizer:
        Tokenizer from huggingface. Must provide ``batch_decode`` and
        ``pad_token_id``.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Each one is called as
        ``metric.compute(predictions=..., references=...)`` on the decoded,
        newline-separated texts.

    Returns
    -------
    metrics: Dict
        Dictionary with relevant metrics for summarization. When
        ``additional_metrics`` is given, their results are stored as a list
        under ``"other_metrics"``.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    # Predictions may arrive as a tuple; only the first element holds the
    # generated token ids (same handling as the Hugging Face summarization
    # example script).
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 cannot be decoded. Replace it with the pad token in predictions as
    # well as labels, so decoding succeeds and gen_len ignores those positions.
    predictions = np.where(
        predictions != -100, predictions, tokenizer.pad_token_id
    )
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence.
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric_sum.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Report scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length, counted as non-pad tokens per prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    if additional_metrics:
        other_results = [
            metric.compute(predictions=decoded_preds, references=decoded_labels)
            for metric in additional_metrics
        ]
        print(f"Other results for this dataset: \n {other_results}")
        # Stored after rounding on purpose: these results are not plain numbers.
        result["other_metrics"] = other_results
    return result