# Source code for nlpboost.metrics

from sklearn.metrics import classification_report
import numpy as np
import nltk
import itertools
from typing import List
import torch
import evaluate

# Download the NLTK "punkt" resource when this module is imported.
# NOTE(review): presumably needed by the nltk.sent_tokenize calls in
# compute_metrics_summarization -- confirm before removing.
nltk.download("punkt")

# Metric objects are loaded once at import time and reused on every call:
# "rouge" by compute_metrics_summarization, "seqeval" by compute_metrics_ner.
metric_sum = evaluate.load("rouge")
metric_seqeval = evaluate.load("seqeval")


def compute_metrics_classification(
    pred, tokenizer=None, id2tag=None, additional_metrics=None
):
    """
    Compute macro-averaged metrics for binary or multi-class classification.

    The predicted class of each example is the index of the highest score
    along axis 1 of ``pred.predictions``.

    Parameters
    ----------
    pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its ``predictions``
        and ``label_ids`` attributes are read.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    metrics: Dict
        The "macro avg" entry of the dictionary produced by
        sklearn.metrics.classification_report; see its documentation for the
        exact keys.
    """
    scores = pred.predictions
    gold_ids = pred.label_ids
    # Collapse the per-class scores to a single predicted class id per example.
    predicted_ids = np.argmax(scores, axis=1)
    report = classification_report(gold_ids, predicted_ids, output_dict=True)
    return report["macro avg"]


def compute_metrics_multilabel(
    pred, tokenizer=None, id2tag=None, additional_metrics=None
):
    """
    Compute the metrics for a multilabel task.

    The raw outputs are passed through a sigmoid and binarised with every
    threshold in ``np.arange(0.1, 0.9, 0.1)``. The macro-averaged metrics of
    the threshold with the highest macro F1 are returned.

    Parameters
    ----------
    pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its ``predictions``
        and ``label_ids`` attributes are read.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    best_metrics: Dict
        The "macro avg" entry of sklearn.metrics.classification_report for the
        best threshold. When several thresholds tie, the lowest one wins.
    """
    logits, labels = pred.predictions, pred.label_ids
    probabilities = torch.sigmoid(torch.from_numpy(logits)).numpy()
    # The label cast does not depend on the threshold, so do it once.
    # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
    labels = labels.astype(int)
    thresholds = np.arange(0.1, 0.9, 0.1)
    # Start below any reachable F1 so that a metrics dict is always returned,
    # even when every threshold scores an F1 of exactly 0.
    best_metrics, best_f1, best_threshold = {}, -1.0, None

    for thres in thresholds:
        # Binarise from the untouched probabilities on every iteration;
        # thresholding an already-binarised array would make every threshold
        # after the first behave identically.
        binary_preds = (probabilities >= thres).astype(int)
        class_report = classification_report(
            labels,
            binary_preds,
            output_dict=True,
        )
        metrics = class_report["macro avg"]
        f1 = metrics["f1-score"]
        if f1 > best_f1:
            best_metrics = metrics
            best_f1 = f1
            best_threshold = thres
    print(f"*** The best threshold is {best_threshold} ***")
    return best_metrics


def compute_metrics_ner(p, tokenizer=None, id2tag=None, additional_metrics=None):
    """
    Compute entity-level metrics for NER with the seqeval metric.

    The predicted tag id of every token is taken as the argmax over axis 2 of
    ``p.predictions``; if that fails, the predictions are treated as tag ids
    already and cast to int. Positions whose label is -100 are dropped, the
    remaining ids are mapped to tag names through ``id2tag``, and the
    resulting tag sequences are scored with the module-level seqeval metric.

    Parameters
    ----------
    p: transformers.EvalPrediction
        Instance of EvalPrediction from transformers. Only its ``predictions``
        and ``label_ids`` attributes are read.
    tokenizer: transformers.Tokenizer
        Tokenizer from huggingface. Not used by this function.
    id2tag: Dict
        Dictionary mapping label ids to label names.
    additional_metrics: List
        List with additional metrics to compute. Not used by this function.

    Returns
    -------
    Metrics
        Dictionary returned by seqeval, with an extra "f1-score" key holding
        the same value as "overall_f1".
    """
    raw_outputs = p.predictions
    gold_rows = p.label_ids

    try:
        pred_rows = np.argmax(raw_outputs, axis=2)
    except Exception:
        print("The output shape is not logits-like, but directly targets.")
        pred_rows = raw_outputs.astype("int")

    # Keep only the (prediction, label) pairs whose label is not the ignored
    # index -100 (special tokens).
    kept_pairs = [
        [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        for pred_row, gold_row in zip(pred_rows, gold_rows)
    ]
    true_predictions = [
        [str(id2tag[pred_id]) for pred_id, _ in row] for row in kept_pairs
    ]
    true_labels = [
        [str(id2tag[gold_id]) for _, gold_id in row] for row in kept_pairs
    ]

    scores = metric_seqeval.compute(
        predictions=true_predictions, references=true_labels
    )
    scores["f1-score"] = scores["overall_f1"]
    return scores


def compute_metrics_summarization(
    eval_pred, tokenizer, id2tag=None, additional_metrics: List = None
):
    """
    Compute metrics for summarization tasks, using the ROUGE metric.

    Predictions and labels are decoded with ``tokenizer``, split into one
    sentence per line and scored with the module-level ROUGE metric. The
    ROUGE values are scaled by 100, the mean generated length is added as
    "gen_len", and every value is rounded to 4 decimals.

    Parameters
    ----------
    eval_pred: transformers.EvalPrediction
        Prediction as output by transformers.Trainer. Only its ``predictions``
        and ``label_ids`` attributes are read. If ``predictions`` is a tuple,
        its first element is used.
    tokenizer:
        Tokenizer from huggingface. Its ``batch_decode`` method and
        ``pad_token_id`` attribute are used.
    id2tag: Dict
        Dictionary mapping label ids to label names. Not used by this function.
    additional_metrics: List
        List with additional metrics to compute. Each one is called as
        ``metric.compute(predictions=..., references=...)`` on the decoded,
        sentence-split texts.

    Returns
    -------
    metrics: Dict
        Dictionary with the ROUGE scores and "gen_len". When
        ``additional_metrics`` is given, the list of their results is stored
        under "other_metrics".
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    # Some models return extra outputs next to the generated ids; the ids are
    # the first element in that case.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. It can show up in the predictions as well as in
    # the labels, so replace it with the pad token id in both.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric_sum.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Report the scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length, counting every non-pad position.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    if additional_metrics:
        other_results = [
            metric.compute(predictions=decoded_preds, references=decoded_labels)
            for metric in additional_metrics
        ]
        print(f"Other results for this dataset: \n {other_results}")
        result["other_metrics"] = other_results
    return result