autotransformers.results_getter
from functools import partial
import numpy as np
import collections
import evaluate
from .utils import match_questions_multiple_answers
from tqdm import tqdm
from .dataset_config import DatasetConfig
from .model_config import ModelConfig
from typing import Any


class ResultsGetter:
    """
    Retrieve results on the test set for different tasks (seq2seq, different forms of classification, NER, QA...).

    Parameters
    ----------
    dataset_config: autotransformers.DatasetConfig
        Configuration for the dataset.
    model_config: autotransformers.ModelConfig
        Configuration for the model.
    compute_metrics_func: Any
        Function to compute metrics.
    """

    def __init__(
        self,
        dataset_config: DatasetConfig,
        model_config: ModelConfig,
        compute_metrics_func: Any,
    ):
        self.dataset_config = dataset_config
        self.model_config = model_config
        self.compute_metrics_func = compute_metrics_func

    def __call__(self, trainer, test_dataset):
        """
        Get results for the test dataset, using a trained transformers.Trainer.

        Parameters
        ----------
        trainer: transformers.Trainer
            Trained trainer, used to get raw predictions on the test dataset.
        test_dataset: datasets.Dataset
            Test dataset for inference. Metrics are computed on this dataset.

        Returns
        -------
        test_results: Dict
            Dictionary with test results.
        """
        if self.dataset_config.task == "qa":
            test_results = self.get_test_results_qa(
                test_dataset,
                trainer,
                self.dataset_config.squad_v2,
            )
        elif self.dataset_config.task == "seq2seq":
            test_results = self.get_test_results_summarization(
                test_dataset,
                trainer,
                self.compute_metrics_func,
                additional_metrics=self.dataset_config.additional_metrics,
            )
        else:
            test_results = self.general_get_test_results(
                test_dataset,
                trainer,
                self.compute_metrics_func,
            )
        return test_results

    def get_test_results_summarization(
        self, test_dataset, trainer, compute_metrics_func, additional_metrics=None
    ):
        """
        Compute and return the test results for summarization tasks.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            Test dataset.
        trainer: transformers.Trainer
            HF's transformers trainer.
        compute_metrics_func: Any
            Function to compute metrics.
        additional_metrics: List
            List with additional metrics to compute.

        Returns
        -------
        metrics: Dict
            Dictionary with metrics for the summarization task.
        """
        if self.model_config.generation_params is None:
            preds = trainer.predict(
                test_dataset,
                max_length=self.model_config.max_length_summary,
                num_beams=self.model_config.num_beams,
            )
        else:
            preds = trainer.predict(
                test_dataset,
                max_length=self.model_config.max_length_summary,
                num_beams=self.model_config.num_beams,
                **self.model_config.generation_params,
            )
        metrics = compute_metrics_func(
            preds, tokenizer=trainer.tokenizer, additional_metrics=additional_metrics
        )
        return metrics

    def general_get_test_results(
        self, test_dataset, trainer, compute_metrics_func, additional_metrics=None
    ):
        """
        Compute metrics for every NLU task except QA.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            Dataset for any task except QA.
        trainer: transformers.Trainer
            Trainer trained on a dataset that is not a QA dataset.
        compute_metrics_func: Any
            Function to compute metrics.
        additional_metrics: List
            List with additional metrics to compute.

        Returns
        -------
        metrics: Dict
            Metrics for the test dataset.
        """
        preds = trainer.predict(test_dataset)
        if hasattr(preds, "metrics"):
            return preds.metrics
        metrics = compute_metrics_func(
            preds,
            tokenizer=trainer.tokenizer,
            id2tag=trainer.model.config.id2label,
            additional_metrics=additional_metrics,
        )
        return metrics

    def get_test_results_qa(
        self, test_dataset, trainer, squad_v2=False, additional_metrics=None
    ):
        """
        Compute metrics on test for QA datasets.

        Parameters
        ----------
        test_dataset: datasets.Dataset
            QA dataset.
        trainer: transformers.Trainer
            Trainer trained on a QA dataset.
        squad_v2: bool
            Whether the dataset is in squad v2 format or not.

        Returns
        -------
        metrics: Dict
            Metrics for the test dataset.
        """
        validation_features = test_dataset.map(
            partial(
                self.prepare_validation_features_squad,
                tokenizer=trainer.tokenizer,
            ),
            batched=True,
            remove_columns=test_dataset.column_names,
        )
        raw_predictions = trainer.predict(validation_features)
        validation_features.set_format(
            type=validation_features.format["type"],
            columns=list(validation_features.features.keys()),
        )
        final_predictions = self.postprocess_qa_predictions(
            test_dataset,
            validation_features,
            raw_predictions.predictions,
            tokenizer=trainer.tokenizer,
        )
        if isinstance(final_predictions, tuple):
            final_predictions = final_predictions[0]

        metric, formatted_predictions = self._get_metric_and_formatted_predictions(
            final_predictions, squad_v2
        )

        references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_dataset]

        references = match_questions_multiple_answers(formatted_predictions, references)

        metrics = metric.compute(
            predictions=formatted_predictions, references=references
        )
        return metrics

    def prepare_validation_features_squad(self, examples, tokenizer, pad_on_right=True):
        """
        Process features for validation on squad-like datasets.

        Parameters
        ----------
        examples: datasets.Dataset
            Samples from datasets.Dataset.
        tokenizer: tokenizers.Tokenizer
            Instance of hf's tokenizer.
        pad_on_right: bool
            Whether or not to pad the samples on the right side. True for most models.

        Returns
        -------
        tokenized_examples:
            Tokenized samples.
        """
        id_field = (
            self.dataset_config.id_field_qa if self.dataset_config is not None else "id"
        )
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space), so we remove that
        # left whitespace.
        examples["question"] = [q.lstrip() for q in examples["question"]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
            is_split_into_words=False,
        )
        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples[id_field][sample_index])

            # Set to None the offset_mapping entries that are not part of the context, so it's easy to determine
            # whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    def postprocess_qa_predictions(
        self,
        examples,
        features,
        raw_predictions,
        tokenizer,
        n_best_size=20,
        max_answer_length=30,
        squad_v2=False,
        min_score=None,
    ):
        """
        Process raw predictions of a QA model.

        Parameters
        ----------
        examples: datasets.Dataset
            Samples from datasets.Dataset.
        features:
            Validation features as processed by prepare_validation_features_squad.
        raw_predictions:
            Predictions returned by the trainer.
        tokenizer: tokenizers.Tokenizer
            Instance of hf's tokenizer.
        n_best_size: int
            Maximum number of best answers to consider.
        max_answer_length: int
            Maximum answer length, in tokens. Answers longer than this are not considered.
        squad_v2: bool
            Whether the dataset is in squad v2 format or not.
        min_score: float
            Optional score threshold used, together with the null (CLS) scores, to decide between the best span
            and the empty answer. Only used when squad_v2 is True.

        Returns
        -------
        predictions: collections.OrderedDict
            An ordered dict with the predictions formatted so that we can compute metrics easily.
        """
        # After raw predictions are obtained from a QA model, this function processes and ranks them by score.
        # It also extracts the concrete text that was predicted, given the predicted start and end tokens.
        id_field = (
            self.dataset_config.id_field_qa if self.dataset_config is not None else "id"
        )
        all_start_logits, all_end_logits = raw_predictions
        # Build a map from each example to its corresponding features.
        example_id_to_index = {k: i for i, k in enumerate(examples[id_field])}
        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[example_id_to_index[feature["example_id"]]].append(i)

        # The dictionaries we have to fill.
        predictions = collections.OrderedDict()
        scores = collections.OrderedDict()

        # Logging.
        print(
            f"Post-processing {len(examples)} example predictions split into {len(features)} features."
        )

        # Let's loop over all the examples!
        for example_index, example in enumerate(tqdm(examples)):
            # These are the indices of the features associated with the current example.
            feature_indices = features_per_example[example_index]

            # Minimum null score; only used if squad_v2 is True.
            min_null_score = None
            valid_answers = []

            cls_scores = []

            context = example["context"]
            # Loop through all the features associated with the current example.
            for feature_index in feature_indices:
                # We grab the predictions of the model for this feature.
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                # This is what allows us to map positions in our logits to spans of text in the original
                # context.
                offset_mapping = features[feature_index]["offset_mapping"]
                # Update minimum null prediction.
                cls_index = features[feature_index]["input_ids"].index(
                    tokenizer.cls_token_id
                )
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                cls_scores.append(feature_null_score)
                if min_null_score is None or min_null_score < feature_null_score:
                    min_null_score = feature_null_score

                # Go through all possibilities for the `n_best_size` greatest start and end logits.
                start_indexes = np.argsort(start_logits)[
                    -1 : -n_best_size - 1 : -1
                ].tolist()
                end_indexes = np.argsort(end_logits)[
                    -1 : -n_best_size - 1 : -1
                ].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Don't consider out-of-scope answers, either because the indices are out of bounds or
                        # correspond to parts of the input_ids that are not in the context.
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ):
                            continue
                        # Don't consider answers with a length that is negative or greater than max_answer_length.
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        text = context[start_char:end_char]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index]
                                + end_logits[end_index],
                                "text": text,
                            }
                        )
            if len(valid_answers) > 0:
                best_answer = sorted(
                    valid_answers, key=lambda x: x["score"], reverse=True
                )[0]
            else:
                # In the very rare edge case where we do not have a single non-null prediction, we create a fake
                # prediction to avoid failure.
                best_answer = {"text": "", "score": 0.0}

            # Let's pick our final answer: the best one or the null answer (only for squad_v2).
            if not squad_v2:
                predictions[example["id"]] = best_answer["text"]
            else:
                if min_score is not None:
                    thres = min([min(cls_scores), min_score])
                else:
                    thres = min(cls_scores)
                answer = best_answer["text"] if best_answer["score"] > thres else ""
                if example["id"] not in predictions:
                    predictions[example["id"]] = answer
                    scores[example["id"]] = best_answer["score"]

        return predictions

    def _get_metric_and_formatted_predictions(self, final_predictions, squad_v2):
        """
        Get the metric from evaluate and the final predictions formatted.

        Parameters
        ----------
        final_predictions: Dict
            Postprocessed predictions.
        squad_v2: bool
            Whether it is squad_v2 mode or not.

        Returns
        -------
        metric: evaluate.Metric
            Metric from the evaluate library.
        formatted_predictions: List
            Predictions in the correct format for the metric.
        """
        if not squad_v2:
            metric = evaluate.load("squad")
            formatted_predictions = [
                {"id": k, "prediction_text": v} for k, v in final_predictions.items()
            ]
        else:
            metric = evaluate.load("squad_v2")
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                for k, v in final_predictions.items()
            ]
        return metric, formatted_predictions
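
The module ships no usage example of its own, so the following is a minimal sketch of how ResultsGetter dispatches on dataset_config.task. It assumes the package is importable as autotransformers.results_getter; the FakeTrainer and the SimpleNamespace configs are hypothetical stand-ins for transformers.Trainer, DatasetConfig and ModelConfig (only the attributes referenced by the code above are provided), not the real classes.

# Hypothetical, self-contained sketch of the dispatch behaviour of ResultsGetter.__call__.
from types import SimpleNamespace

from autotransformers.results_getter import ResultsGetter  # assumes the package is installed


class FakeTrainer:
    """Stand-in trainer whose predict() returns an object with a .metrics attribute,
    mimicking the PredictionOutput returned by transformers.Trainer.predict."""

    tokenizer = None
    model = SimpleNamespace(config=SimpleNamespace(id2label={0: "neg", 1: "pos"}))

    def predict(self, dataset, **kwargs):
        return SimpleNamespace(metrics={"test_accuracy": 0.9}, predictions=None)


# Stand-in configs exposing only the fields ResultsGetter reads.
dataset_config = SimpleNamespace(task="classification", squad_v2=False, additional_metrics=None)
model_config = SimpleNamespace(max_length_summary=128, num_beams=4, generation_params=None)

getter = ResultsGetter(dataset_config, model_config, compute_metrics_func=None)
# task != "qa" and != "seq2seq", so this falls through to general_get_test_results,
# which returns preds.metrics when the prediction output carries metrics.
print(getter(FakeTrainer(), test_dataset=[]))  # -> {'test_accuracy': 0.9}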
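
To make the span-selection step in postprocess_qa_predictions concrete, here is a standalone toy sketch with invented logits and offsets: it picks the (start, end) token pair with the highest summed logits whose offsets fall inside the context, then slices the answer text from the context. It is simplified to a single feature and omits the squad_v2 null-answer handling.

# Illustrative sketch (toy values) of the span-selection logic used above.
import numpy as np

context = "The Eiffel Tower is located in Paris."
# Character offsets of each token in `context`; None marks tokens outside the context
# (question and special tokens), exactly as prepare_validation_features_squad sets them.
offset_mapping = [None, (0, 3), (4, 10), (11, 16), (17, 19), (20, 27), (28, 30), (31, 36), (36, 37), None]
start_logits = np.array([0.1, 0.2, 0.3, 0.2, 0.1, 0.1, 0.2, 2.5, 0.1, 0.1])
end_logits = np.array([0.1, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 2.7, 0.1, 0.1])

n_best_size, max_answer_length = 5, 30
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

valid_answers = []
for s in start_indexes:
    for e in end_indexes:
        if offset_mapping[s] is None or offset_mapping[e] is None:
            continue  # token not part of the context
        if e < s or e - s + 1 > max_answer_length:
            continue  # negative or overly long span
        valid_answers.append(
            {
                "score": float(start_logits[s] + end_logits[e]),
                "text": context[offset_mapping[s][0]:offset_mapping[e][1]],
            }
        )

best = max(valid_answers, key=lambda a: a["score"])
print(best["text"])  # -> Paris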