autotransformers.results_getter

from functools import partial
import numpy as np
import collections
import evaluate
from .utils import match_questions_multiple_answers
from tqdm import tqdm
from .dataset_config import DatasetConfig
from .model_config import ModelConfig
from typing import Any


class ResultsGetter:
    """
    Retrieve results on the test set for different tasks (seq2seq, different forms of classification, NER, QA...).

    Parameters
    ----------
    dataset_config: autotransformers.DatasetConfig
        Configuration for the dataset.
    model_config: autotransformers.ModelConfig
        Configuration for the model.
    compute_metrics_func: Any
        Function to compute metrics.
    """

    def __init__(
        self,
        dataset_config: DatasetConfig,
        model_config: ModelConfig,
        compute_metrics_func: Any,
    ):
        self.dataset_config = dataset_config
        self.model_config = model_config
        self.compute_metrics_func = compute_metrics_func

    def __call__(self, trainer, test_dataset):
        """
        Get results for the test dataset, using a trained transformers.Trainer.

        Parameters
        ----------
        trainer: transformers.Trainer
            Trained trainer, used to get raw predictions on the test dataset.
        test_dataset: datasets.Dataset
            Test dataset for inference. Metrics are computed on this dataset.

        Returns
        -------
        test_results: Dict
            Dictionary with test results.
        """
        if self.dataset_config.task == "qa":
            test_results = self.get_test_results_qa(
                test_dataset,
                trainer,
                self.dataset_config.squad_v2,
            )
        elif self.dataset_config.task == "seq2seq":
            test_results = self.get_test_results_summarization(
                test_dataset,
                trainer,
                self.compute_metrics_func,
                additional_metrics=self.dataset_config.additional_metrics,
            )
        else:
            test_results = self.general_get_test_results(
                test_dataset,
                trainer,
                self.compute_metrics_func,
            )
        return test_results

    def get_test_results_summarization(
        self, test_dataset, trainer, compute_metrics_func, additional_metrics=None
    ):
        """
        Compute and return the test results for summarization tasks.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            Test dataset.
        trainer: transformers.Trainer
            HF's transformers trainer.
        compute_metrics_func: Any
            Function to compute metrics.
        additional_metrics: List
            List with additional metrics to compute.

        Returns
        -------
        metrics: Dict
            Dictionary with metrics for the summarization task.
        """
        if self.model_config.generation_params is None:
            preds = trainer.predict(
                test_dataset,
                max_length=self.model_config.max_length_summary,
                num_beams=self.model_config.num_beams,
            )
        else:
            preds = trainer.predict(
                test_dataset,
                max_length=self.model_config.max_length_summary,
                num_beams=self.model_config.num_beams,
                **self.model_config.generation_params,
            )
        metrics = compute_metrics_func(
            preds, tokenizer=trainer.tokenizer, additional_metrics=additional_metrics
        )
        return metrics

    def general_get_test_results(
        self, test_dataset, trainer, compute_metrics_func, additional_metrics=None
    ):
        """
        Compute metrics for every NLU task except QA.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            Test dataset for any task except QA.
        trainer: transformers.Trainer
            Trainer trained on a dataset that is not a QA dataset.
        compute_metrics_func: Any
            Function to compute metrics.
        additional_metrics: List
            List with additional metrics to compute.

        Returns
        -------
        metrics: Dict
            Metrics for the test dataset.
        """
        preds = trainer.predict(test_dataset)
        if hasattr(preds, "metrics"):
            return preds.metrics
        metrics = compute_metrics_func(
            preds,
            tokenizer=trainer.tokenizer,
            id2tag=trainer.model.config.id2label,
            additional_metrics=additional_metrics,
        )
        return metrics

    def get_test_results_qa(
        self, test_dataset, trainer, squad_v2=False, additional_metrics=None
    ):
        """
        Compute test metrics for QA datasets.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            QA dataset.
        trainer: transformers.Trainer
            Trainer trained on a QA dataset.
        squad_v2: bool
            Whether the dataset is in squad v2 format or not.
        additional_metrics: List
            List with additional metrics to compute.

        Returns
        -------
        metrics: Dict
            Metrics for the test dataset.
        """
        validation_features = test_dataset.map(
            partial(
                self.prepare_validation_features_squad,
                tokenizer=trainer.tokenizer,
            ),
            batched=True,
            remove_columns=test_dataset.column_names,
        )
        raw_predictions = trainer.predict(validation_features)
        validation_features.set_format(
            type=validation_features.format["type"],
            columns=list(validation_features.features.keys()),
        )
        final_predictions = self.postprocess_qa_predictions(
            test_dataset,
            validation_features,
            raw_predictions.predictions,
            tokenizer=trainer.tokenizer,
        )
        if isinstance(final_predictions, tuple):
            final_predictions = final_predictions[0]

        metric, formatted_predictions = self._get_metric_and_formatted_predictions(
            final_predictions, squad_v2
        )

        references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_dataset]

        references = match_questions_multiple_answers(formatted_predictions, references)

        metrics = metric.compute(
            predictions=formatted_predictions, references=references
        )
        return metrics

    def prepare_validation_features_squad(self, examples, tokenizer, pad_on_right=True):
        """
        Process features for validating on squad-like datasets.

        Parameters
        ----------
        examples: datasets.Dataset
            Samples from datasets.Dataset.
        tokenizer: tokenizers.Tokenizer
            Instance of hf's tokenizer.
        pad_on_right: bool
            Whether or not to pad the samples on the right side. True for most models.

        Returns
        -------
        tokenized_examples:
            Tokenized samples.
        """
        id_field = (
            self.dataset_config.id_field_qa if self.dataset_config is not None else "id"
        )
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space), so we remove that
        # left whitespace.
        examples["question"] = [q.lstrip() for q in examples["question"]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
            is_split_into_words=False,
        )
        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples[id_field][sample_index]
            )

            # Set to None the offset_mapping entries that are not part of the context, so it is easy to determine
            # whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    def postprocess_qa_predictions(
        self,
        examples,
        features,
        raw_predictions,
        tokenizer,
        n_best_size=20,
        max_answer_length=30,
        squad_v2=False,
        min_score=None,
    ):
        """
        Process raw predictions of a QA model.

        Parameters
        ----------
        examples: datasets.Dataset
            Samples from datasets.Dataset.
        features:
            Validation features as processed by prepare_validation_features_squad.
        raw_predictions:
            Raw predictions returned by the trainer (start and end logits).
        tokenizer: tokenizers.Tokenizer
            Instance of hf's tokenizer.
        n_best_size: int
            Maximum number of best answers to consider.
        max_answer_length: int
            Maximum answer length, in tokens. Answers longer than this are not considered.
        squad_v2: bool
            Whether the dataset is in squad v2 format or not.
        min_score: float
            Optional score threshold. In squad_v2 mode it is combined with the per-feature CLS (null) scores to
            decide whether an empty answer should be returned instead of the best span.

        Returns
        -------
        predictions: collections.OrderedDict
            An ordered dict with the predictions formatted so that we can compute metrics easily.
        """
        # After raw predictions are produced by a QA model, this function processes them, sorts them by score,
        # and extracts the concrete text that was predicted given the predicted start and end tokens.
        id_field = (
            self.dataset_config.id_field_qa if self.dataset_config is not None else "id"
        )
        all_start_logits, all_end_logits = raw_predictions
        # Build a map from each example to its corresponding features.
        example_id_to_index = {k: i for i, k in enumerate(examples[id_field])}
        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[example_id_to_index[feature["example_id"]]].append(i)

        # The dictionaries we have to fill.
        predictions = collections.OrderedDict()
        scores = collections.OrderedDict()

        # Logging.
        print(
            f"Post-processing {len(examples)} example predictions split into {len(features)} features."
        )

        # Loop over all the examples.
        for example_index, example in enumerate(tqdm(examples)):
            # These are the indices of the features associated with the current example.
            feature_indices = features_per_example[example_index]

            min_null_score = None
            valid_answers = []

            cls_scores = []

            context = example["context"]
            # Loop through all the features associated with the current example.
            for feature_index in feature_indices:
                # Grab the predictions of the model for this feature.
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                # This is what allows us to map positions in our logits to spans of text in the original context.
                offset_mapping = features[feature_index]["offset_mapping"]
                # Update the minimum null prediction.
                cls_index = features[feature_index]["input_ids"].index(
                    tokenizer.cls_token_id
                )
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                cls_scores.append(feature_null_score)
                if min_null_score is None or min_null_score < feature_null_score:
                    min_null_score = feature_null_score

                # Go through all possibilities for the `n_best_size` greatest start and end logits.
                start_indexes = np.argsort(start_logits)[
                    -1 : -n_best_size - 1 : -1
                ].tolist()
                end_indexes = np.argsort(end_logits)[
                    -1 : -n_best_size - 1 : -1
                ].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Don't consider out-of-scope answers, either because the indices are out of bounds or
                        # correspond to parts of the input_ids that are not in the context.
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ):
                            continue
                        # Don't consider answers with a length that is either < 0 or > max_answer_length.
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        text = context[start_char:end_char]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index]
                                + end_logits[end_index],
                                "text": text,
                            }
                        )
            if len(valid_answers) > 0:
                best_answer = sorted(
                    valid_answers, key=lambda x: x["score"], reverse=True
                )[0]
            else:
                # In the very rare edge case where there is not a single non-null prediction, we create a fake
                # prediction to avoid failure.
                best_answer = {"text": "", "score": 0.0}

            # Pick the final answer: the best one or the null answer (only for squad_v2).
            if not squad_v2:
                predictions[example["id"]] = best_answer["text"]
            else:
                if min_score is not None:
                    thres = min([min(cls_scores), min_score])
                else:
                    thres = min(cls_scores)
                answer = best_answer["text"] if best_answer["score"] > thres else ""
                if example["id"] not in predictions:
                    predictions[example["id"]] = answer
                    scores[example["id"]] = best_answer["score"]

        return predictions

    def _get_metric_and_formatted_predictions(self, final_predictions, squad_v2):
        """
        Get the metric from evaluate and the final predictions, formatted.

        Parameters
        ----------
        final_predictions: Dict
            Postprocessed predictions.
        squad_v2: bool
            Whether it is squad_v2 mode or not.

        Returns
        -------
        metric: evaluate.Metric
            Metric from the evaluate library.
        formatted_predictions: List
            Predictions in the correct format for the metric.
        """
        if not squad_v2:
            metric = evaluate.load("squad")
            formatted_predictions = [
                {"id": k, "prediction_text": v} for k, v in final_predictions.items()
            ]
        else:
            metric = evaluate.load("squad_v2")
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                for k, v in final_predictions.items()
            ]
        return metric, formatted_predictions
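
The class above is meant to be used right after fine-tuning. Below is a minimal usage sketch, not taken from the package's own documentation: dataset_cfg, model_cfg, compute_metrics, trainer and test_ds are placeholders for a DatasetConfig, a ModelConfig, a metrics function, a trained transformers.Trainer and a datasets.Dataset test split coming from your own pipeline.

from autotransformers.results_getter import ResultsGetter

# Placeholders, assumed to exist already in your training pipeline:
#   dataset_cfg: DatasetConfig, model_cfg: ModelConfig,
#   compute_metrics: metrics function, trainer: trained transformers.Trainer,
#   test_ds: datasets.Dataset with the test split.
results_getter = ResultsGetter(
    dataset_config=dataset_cfg,
    model_config=model_cfg,
    compute_metrics_func=compute_metrics,
)
# __call__ dispatches on dataset_cfg.task: "qa", "seq2seq" or any other NLU task.
test_results = results_getter(trainer, test_ds)
print(test_results)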

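prepare_validation_features_squad relies on the fast-tokenizer overflow mechanism: a long (question, context) pair is split into several overlapping features, and overflow_to_sample_mapping links every feature back to its original example. The snippet below is an illustrative, self-contained sketch of that mechanism; bert-base-uncased is an arbitrary checkpoint chosen for the example, not something the library prescribes.

from transformers import AutoTokenizer

# Arbitrary fast tokenizer, used only to illustrate the overflow behaviour.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

question = "What does the stride argument do?"
context = "A long context sentence that keeps repeating itself. " * 100

encoded = tokenizer(
    question,
    context,
    truncation="only_second",
    max_length=128,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)

# The single example was split into several overlapping features.
print(len(encoded["input_ids"]))
# Every feature maps back to the same original sample (index 0 here).
print(encoded["overflow_to_sample_mapping"])
# sequence_ids(i) marks question tokens with 0 and context tokens with 1 (special tokens are None),
# which is how the offset_mapping entries outside the context get set to None.
print(encoded.sequence_ids(0)[:10])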
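
For reference, get_test_results_qa hands the formatted predictions and references to the squad (or squad_v2) metric from the evaluate library. The toy example below, with made-up ids and answers, shows the shape that _get_metric_and_formatted_predictions produces in the non-squad_v2 case and the reference format the metric expects.

import evaluate

# Toy postprocessed predictions keyed by example id (made-up data).
final_predictions = {"example-1": "Paris", "example-2": ""}

metric = evaluate.load("squad")
formatted_predictions = [
    {"id": k, "prediction_text": v} for k, v in final_predictions.items()
]
# References follow the SQuAD answers format: parallel lists of gold texts and start characters.
references = [
    {"id": "example-1", "answers": {"text": ["Paris"], "answer_start": [0]}},
    {"id": "example-2", "answers": {"text": ["Berlin"], "answer_start": [10]}},
]
print(metric.compute(predictions=formatted_predictions, references=references))
# Returns a dict with "exact_match" and "f1" scores.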