autotransformers.dataset_config

from dataclasses import dataclass, asdict, field
from typing import Dict, List, Any


@dataclass
class DatasetConfig:
    """
    Configure a dataset for use within the AutoTrainer class.

    This determines how to load the dataset,
    whether local files are needed, whether additional splits are needed (for example when the original
    dataset only has train-test and we also want validation), and so on.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset.
    alias: str
        Alias for the dataset, for saving it.
    task: str
        The task of the dataset. Currently, only classification, ner and qa (question answering) are available.
    fixed_training_args: Dict
        The training arguments (to use in transformers.TrainingArguments) for every model on this dataset, in dictionary format.
    is_multilabel: bool
        Whether it is multilabel classification.
    multilabel_label_names: List
        Names of the labels for multilabel training.
    hf_load_kwargs: Dict
        Arguments for loading the dataset from the huggingface datasets' hub. Example: {'path': 'wikiann', 'name': 'es'}.
        If None, it is assumed that all necessary files exist locally and are passed in the files field.
    type_load: str
        The type of load to perform in load_dataset; for example, if your data is in csv format (d = load_dataset('csv', ...)), this should be csv.
    files: Dict
        Files to load the dataset from, in Huggingface's datasets format. Possible keys are train, validation and test.
    data_field: str
        Field to load data from when loading JSON files with datasets.
    partial_split: bool
        Whether a partial split is needed; that is, if you only have train and test sets, this should be True so that a new validation set is created.
    split: bool
        This should be True when you only have one split, that is, a single big train set; this creates new validation and test sets.
    label_col: str
        Name of the label column.
    val_size: float
        In case no validation split is provided, the proportion of the training data to leave for validation.
    test_size: float
        In case no test split is provided, the proportion of the total data to leave for testing.
    pre_func: Any
        Function to perform previous transformations. For example, if your dataset lacks a field (for example, xquad lacks the title field), you can fix it in a function provided here.
    remove_fields_pre_func: bool
        Whether to remove fields after pre_func is applied.
    squad_v2: bool
        Only useful for question answering. Whether it is squad v2 format or not. Default is False.
    text_field: str
        The name of the field containing the text. Useful only in the case of single-text-field datasets, like most datasets are. In the case of 2-sentence datasets like xnli or paws-x this is not useful. Default is text.
    is_2sents: bool
        Whether it is a 2-sentence dataset. Useful for processing datasets like xnli or paws-x.
    sentence1_field: str
        In case this is a 2-sentence dataset, the name of the first sentence field.
    sentence2_field: str
        In case this is a 2-sentence dataset, the name of the second sentence field.
    summary_field: str
        The name of the field with summaries (we assume the long texts are in the text_field field). Only useful for summarization tasks. Default is summary.
    callbacks: List
        Callbacks to use inside transformers.
    metric_optimize: str
        Name of the metric you want to optimize in the hyperparameter search.
    direction_optimize: str
        Direction of the optimization problem. Whether you want to maximize or minimize metric_optimize.
    custom_eval_func: Any
        In case we want a special evaluation function, we can provide it here. It must receive EvalPredictions from the trainer, like any compute_metrics function in transformers.
    seed: int
        Seed for the optuna sampler.
    max_length_summary: int
        Max length of the summaries, for tokenization purposes. It will be changed depending on the ModelConfig.
    num_proc: int
        Number of processes to preprocess data.
    loaded_dataset: Any
        In case you want to do something custom, like concatenating datasets, you can do that here, by passing an already loaded (non-tokenized) dataset in this field.
    additional_metrics: List
        List of additional metrics loaded from datasets, to compute over the test part.
    retrain_at_end: bool
        Whether to retrain with the best-performing model. In most cases this should be True, except when training a single model with a single set of hyperparameters.
    config_num_labels: int
        Number of labels to set for the config; if None, it will be computed based on the number of labels detected.
    smoke_test: bool
        Whether to select only the top 10 rows of the dataset for smoke testing purposes.
    augment_data: bool
        Whether to augment the data or not.
    data_augmentation_steps: List
        List of data augmentation techniques to use from NLPAugPipeline.
    id_field_qa: str
        Name of the field with the unique id of the examples in a question answering dataset.
    pretokenized_dataset: Any
        Pre-tokenized dataset, to avoid tokenizing inside AutoTrainer, which may cause memory issues with huge datasets.
    model_config_problem_type: str
        Problem type to set for the model's config. This depends on the dataset task.
    chat_field: str
        The name of the column in the dataset containing the messages between the user and the assistant.

    Examples
    --------
    One can easily create a DatasetConfig for the conll2002 dataset with the following:

    >>> from autotransformers import DatasetConfig
    >>> config = {'fixed_training_args': {}, 'dataset_name': 'conll2002', 'alias': 'conll2002', 'task': 'ner', 'hf_load_kwargs': {'path': 'conll2002', 'name': 'es'}, 'label_col': 'ner_tags'}
    >>> config = DatasetConfig(**config)
    """

    dataset_name: str = field(metadata={"help": "The name of the dataset."})
    alias: str = field(metadata={"help": "Alias for the dataset, for saving it."})
    task: str = field(
        metadata={
            "help": "The task of the dataset. Currently, only classification, ner and qa (question answering) are available."
        }
    )
    fixed_training_args: Dict = field(
        metadata={
            "help": "The training arguments (to use in transformers.TrainingArguments) for every model on this dataset, in dictionary format."
        }
    )
    is_multilabel: bool = field(
        default=False, metadata={"help": "Whether it is multilabel classification."}
    )
    multilabel_label_names: List = field(
        default_factory=list,
        metadata={"help": "Names of the labels for multilabel training."},
    )
    hf_load_kwargs: Dict = field(
        default=None,
        metadata={
            "help": (
                "Arguments for loading the dataset from the huggingface datasets' hub. Example: {'path': 'wikiann', 'name': 'es'}. "
                "If None, it is assumed that all necessary files exist locally and are passed in the files field."
            )
        },
    )
    type_load: str = field(
        default="json",
        metadata={
            "help": "The type of load to perform in load_dataset; for example, if your data is in csv format (d = load_dataset('csv', ...)), this should be csv."
        },
    )
    files: Dict = field(
        default=None,
        metadata={
            "help": "Files to load the dataset from, in Huggingface's datasets format. Possible keys are train, validation and test."
        },
    )
    data_field: str = field(
        default="data",
        metadata={
            "help": "Field to load data from when loading JSON files with datasets."
        },
    )
    partial_split: bool = field(
        default=False,
        metadata={
            "help": "Whether a partial split is needed; that is, if you only have train and test sets, this should be True so that a new validation set is created."
        },
    )
    split: bool = field(
        default=False,
        metadata={
            "help": "This should be True when you only have one split, that is, a single big train set; this creates new validation and test sets."
        },
    )
    label_col: str = field(
        default="label_list", metadata={"help": "Name of the label column."}
    )
    val_size: float = field(
        default=0.15,
        metadata={
            "help": "In case no validation split is provided, the proportion of the training data to leave for validation."
        },
    )
    test_size: float = field(
        default=0.15,
        metadata={
            "help": "In case no test split is provided, the proportion of the total data to leave for testing."
        },
    )
    pre_func: Any = field(
        default=None,
        metadata={
            "help": "Function to perform previous transformations. For example, if your dataset lacks a field (for example, xquad lacks the title field), you can fix it in a function provided here."
        },
    )
    remove_fields_pre_func: bool = field(
        default=False,
        metadata={"help": "Whether to remove fields after pre_func is applied."},
    )
    squad_v2: bool = field(
        default=False,
        metadata={
            "help": "Only useful for question answering. Whether it is squad v2 format or not. Default is False."
        },
    )
    text_field: str = field(
        default="text",
        metadata={
            "help": "The name of the field containing the text. Useful only in the case of single-text-field datasets, like most datasets are. In the case of 2-sentence datasets like xnli or paws-x this is not useful."
        },
    )
    is_2sents: bool = field(
        default=False,
        metadata={
            "help": "Whether it is a 2-sentence dataset. Useful for processing datasets like xnli or paws-x."
        },
    )
    sentence1_field: str = field(
        default=None,
        metadata={
            "help": "In case this is a 2-sentence dataset, the name of the first sentence field."
        },
    )
    sentence2_field: str = field(
        default=None,
        metadata={
            "help": "In case this is a 2-sentence dataset, the name of the second sentence field."
        },
    )
    summary_field: str = field(
        default="summary",
        metadata={
            "help": "The name of the field with summaries (we assume the long texts are in the text_field field). Only useful for summarization tasks."
        },
    )
    callbacks: List = field(
        default_factory=list, metadata={"help": "Callbacks to use inside transformers."}
    )
    metric_optimize: str = field(
        default="eval_loss",
        metadata={
            "help": "Name of the metric you want to optimize in the hyperparameter search."
        },
    )
    direction_optimize: str = field(
        default="minimize",
        metadata={
            "help": "Direction of the optimization problem. Whether you want to maximize or minimize metric_optimize."
        },
    )
    custom_eval_func: Any = field(
        default=None,
        metadata={
            "help": "In case we want a special evaluation function, we can provide it here. It must receive EvalPredictions from the trainer, like any compute_metrics function in transformers."
        },
    )
    seed: int = field(default=420, metadata={"help": "Seed for the optuna sampler."})
    max_length_summary: int = field(
        default=120,
        metadata={
            "help": "Max length of the summaries, for tokenization purposes. It will be changed depending on the ModelConfig."
        },
    )
    num_proc: int = field(
        default=4, metadata={"help": "Number of processes to preprocess data."}
    )
    loaded_dataset: Any = field(
        default=None,
        metadata={
            "help": "In case you want to do something custom, like concatenating datasets, you can do that here, by passing an already loaded (non-tokenized) dataset in this field."
        },
    )
    additional_metrics: List = field(
        default=None,
        metadata={
            "help": "List of additional metrics loaded from datasets, to compute over the test part."
        },
    )
    retrain_at_end: bool = field(
        default=True,
        metadata={
            "help": "Whether to retrain with the best-performing model. In most cases this should be True, except when you're only training a single model with a single set of hyperparameters."
        },
    )
    config_num_labels: int = field(
        default=None,
        metadata={
            "help": "Number of labels to set for the config; if None, it will be computed based on the number of labels detected."
        },
    )
    smoke_test: bool = field(
        default=False,
        metadata={
            "help": "Whether to select only the top 10 rows of the dataset for smoke testing purposes."
        },
    )
    augment_data: bool = field(
        default=False, metadata={"help": "Whether to augment the data or not."}
    )
    data_augmentation_steps: List = field(
        default_factory=list,
        metadata={
            "help": "List of data augmentation techniques to use from NLPAugPipeline."
        },
    )
    id_field_qa: str = field(
        default="id",
        metadata={
            "help": "Name of the field with the unique id of the examples in a question answering dataset."
        },
    )
    pretokenized_dataset: Any = field(
        default=None,
        metadata={
            "help": "Pre-tokenized dataset, to avoid tokenizing inside AutoTrainer, which may cause memory issues with huge datasets."
        },
    )
    model_config_problem_type: str = field(
        default=None,
        metadata={
            "help": "Problem type to set for the model's config. This depends on the dataset task."
        },
    )
    chat_field: str = field(
        default="messages",
        metadata={
            "help": "The name of the column in the dataset containing the messages between the user and the assistant."
        },
    )

    def __str__(
        self,
    ):
        """Return the string representation of the dataset config."""
        self_as_dict = asdict(self)
        attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())]
        return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})"
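For two-sentence tasks such as paws-x or xnli, the relevant switches are is_2sents together with sentence1_field and sentence2_field. The sketch below assumes the Spanish paws-x configuration and its usual column names ('sentence1', 'sentence2', 'label'); treat those names, and the metric key passed to metric_optimize, as assumptions to verify against the dataset you load and the keys your compute_metrics function reports.

from autotransformers import DatasetConfig

# Hypothetical two-sentence (paraphrase identification) setup.
# Column names below are assumed for paws-x; verify against the loaded dataset.
pawsx_config = DatasetConfig(
    dataset_name="paws-x",
    alias="pawsx_es",
    task="classification",
    fixed_training_args={},
    hf_load_kwargs={"path": "paws-x", "name": "es"},
    is_2sents=True,
    sentence1_field="sentence1",
    sentence2_field="sentence2",
    label_col="label",
    metric_optimize="eval_accuracy",  # must match a key produced during evaluation
    direction_optimize="maximize",
)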