Source code for nlpboost.tokenization_functions

import re
import tokenizers
import evaluate
import collections
from tqdm import tqdm
import numpy as np
from functools import partial
from .utils import match_questions_multiple_answers


def tokenize_classification(examples, tokenizer, dataset_config):
    """
    Tokenize a batch of a classification dataset and attach its labels.

    Works for single-sentence and sentence-pair inputs, and for
    single-label and multilabel targets, as selected by ``dataset_config``.

    Parameters
    ----------
    examples: datasets.Dataset
        Samples from datasets.Dataset.
    tokenizer: tokenizers.Tokenizer
        Instance of hf's tokenizer.
    dataset_config: benchmarker.DatasetConfig
        Instance of a Dataset Config. The attributes read here are
        ``is_2sents``, ``sentence1_field``, ``sentence2_field``,
        ``text_field``, ``is_multilabel`` and ``label_col``.

    Returns
    -------
    tokenized:
        Tokenized samples, with an extra ``"labels"`` entry.
    """
    # Sentence-pair tasks feed two text columns, everything else just one.
    if dataset_config.is_2sents:
        text_inputs = (
            examples[dataset_config.sentence1_field],
            examples[dataset_config.sentence2_field],
        )
    else:
        text_inputs = (examples[dataset_config.text_field],)

    tokenized = tokenizer(
        *text_inputs,
        truncation=True,
        padding="longest",
        max_length=512,
    )

    if not dataset_config.is_multilabel:
        tokenized["labels"] = examples[dataset_config.label_col]
        return tokenized

    # Multilabel: every column whose name does not contain the text field
    # name is a label column. Sorting by name keeps the label order stable.
    label_columns = sorted(
        name for name in examples if dataset_config.text_field not in name
    )
    n_rows = len(examples[dataset_config.text_field])
    tokenized["labels"] = [
        [float(examples[name][row]) for name in label_columns]
        for row in range(n_rows)
    ]
    return tokenized
def tokenize_ner(examples, tokenizer, dataset_config):
    """
    Tokenize a batch of pre-split words and align word labels to tokens.

    This function is intended to be used inside the map method for the Dataset.

    Parameters
    ----------
    examples: datasets.Dataset
        Samples from datasets.Dataset.
    tokenizer: tokenizers.Tokenizer
        Instance of hf's tokenizer.
    dataset_config: benchmarker.DatasetConfig
        Instance of a Dataset Config. ``text_field`` and ``label_col``
        are read from it.

    Returns
    -------
    tokenized:
        Tokenized samples, with a ``"labels"`` entry holding one label per token.
    """
    # Label given to tokens that map to no word (word id is None), so that
    # they are automatically ignored in the loss function.
    ignore_index = -100

    tokenized = tokenizer(
        examples[dataset_config.text_field],
        truncation=True,
        is_split_into_words=True,
        padding="longest",
        max_length=512,
    )

    # Every token that belongs to a word receives that word's label.
    tokenized["labels"] = [
        [
            ignore_index if word_id is None else word_labels[word_id]
            for word_id in tokenized.word_ids(batch_index=row)
        ]
        for row, word_labels in enumerate(examples[dataset_config.label_col])
    ]
    return tokenized
def _squad_null_answer_index(input_ids, tokenizer):
    """Return the token index used to label a feature that holds no answer."""
    # Prefer CLS, then BOS; tokenizers that define neither (or whose special
    # token is absent from this feature) fall back to the first position.
    for attribute in ("cls_token_id", "bos_token_id"):
        token_id = getattr(tokenizer, attribute, None)
        if token_id is not None and token_id in input_ids:
            return input_ids.index(token_id)
    return 0


def tokenize_squad(examples, tokenizer, dataset_config=None, pad_on_right=True):
    """
    Tokenize samples of squad-like datasets, on batches.

    Each question/context pair is encoded with a maximum length of 512 and
    a stride of 128, so a long context yields several overlapping features.
    Every feature is labelled with the start and end token positions of the
    first answer of its example. Features with no answer, or whose span
    does not contain the answer, are labelled with the CLS token index
    (falling back to the BOS token index, then to 0, when the tokenizer has
    no CLS token).

    Parameters
    ----------
    examples: datasets.Dataset
        Samples from datasets.Dataset. Must provide the ``"question"``,
        ``"context"`` and ``"answers"`` columns; each answers entry holds
        ``"answer_start"`` and ``"text"`` lists.
    tokenizer: tokenizers.Tokenizer
        Instance of hf's tokenizer.
    dataset_config: benchmarker.DatasetConfig
        Not used by this function; accepted so the signature matches the
        other tokenization functions.
    pad_on_right: bool
        Whether or not to pad the samples on the right side. True for most models.
        When True the question is the first sequence and the context the
        second one; when False the order is swapped.

    Returns
    -------
    tokenized_examples:
        Tokenized samples, with ``"start_positions"`` and ``"end_positions"``
        added and the overflow/offset mappings removed.
    """
    # Sequence id that marks context tokens in each encoded feature.
    context_id = 1 if pad_on_right else 0

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long
    # context, we need a map from a feature to its corresponding example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings give a map from token to character position in
    # the original context; they are used to compute the token positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # Impossible answers are labelled with this index.
        null_index = _squad_null_answer_index(input_ids, tokenizer)

        # Tells, per token, whether it belongs to the question or the context.
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans; fetch the example this one came from.
        answers = examples["answers"][sample_mapping[i]]

        # If no answers are given, label the feature as "no answer".
        if len(answers["answer_start"]) == 0:
            start_positions.append(null_index)
            end_positions.append(null_index)
            continue

        # Start/end character index of the (first) answer in the text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First context token of the current span.
        token_start_index = 0
        while sequence_ids[token_start_index] != context_id:
            token_start_index += 1

        # Last context token of the current span.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_id:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this
        # feature is labelled as "no answer").
        answer_in_span = (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        )
        if not answer_in_span:
            start_positions.append(null_index)
            end_positions.append(null_index)
            continue

        # Otherwise move token_start_index and token_end_index to the two
        # ends of the answer.
        # Note: we could go after the last offset if the answer is the last
        # word (edge case), hence the bound check on the start index.
        while (
            token_start_index < len(offsets)
            and offsets[token_start_index][0] <= start_char
        ):
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples
def tokenize_summarization(examples, tokenizer, dataset_config):
    """
    Tokenization function for summarization tasks.

    Source texts are encoded as model inputs; summaries are encoded with
    the tokenizer switched to target mode and stored as labels.

    Parameters
    ----------
    examples: datasets.Dataset
        Samples from datasets.Dataset.
    tokenizer: tokenizers.Tokenizer
        Instance of hf's tokenizer.
    dataset_config: benchmarker.DatasetConfig
        Instance of a Dataset Config. ``text_field``, ``summary_field`` and
        ``max_length_summary`` are read from it.

    Returns
    -------
    model_inputs:
        Tokenized source texts, with the summary token ids under ``"labels"``.
    """
    source_texts = examples[dataset_config.text_field]
    # Inputs are truncated to the tokenizer's own maximum length.
    model_inputs = tokenizer(
        source_texts,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

    # Summaries are encoded inside the target-tokenizer context and
    # truncated to the configured summary length.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples[dataset_config.summary_field],
            max_length=dataset_config.max_length_summary,
            truncation=True,
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs