-
Notifications
You must be signed in to change notification settings - Fork 504
/
arguments.py
375 lines (365 loc) · 14.9 KB
/
arguments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
from dataclasses import dataclass, field
from typing import Optional, List
from transformers import Seq2SeqTrainingArguments
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
feature_extractor_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
)
description_tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
)
prompt_tokenizer_name: Optional[str] = field(
default=None,
metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
pad_token_id: int = field(
default=None,
metadata={"help": "If specified, change the model pad token id."},
)
decoder_start_token_id: int = field(
default=None,
metadata={"help": "If specified, change the model decoder start token id."},
)
freeze_text_encoder: bool = field(
default=False,
metadata={"help": "Whether to freeze the text encoder."},
)
do_sample: bool = field(
default=True,
metadata={"help": "Whether to do sampling or greedy decoding."},
)
temperature: float = field(
default=1.0,
metadata={"help": "Temperature if sampling."},
)
max_length: int = field(
default=2580,
metadata={"help": "Generation max length."},
)
bandwidth: float = field(
default=6,
metadata={"help": "Audio encoder bandwidth."},
)
asr_model_name_or_path: str = field(
default="distil-whisper/distil-large-v2",
metadata={
"help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
},
)
clap_model_name_or_path: str = field(
default="laion/larger_clap_music_and_speech",
metadata={
"help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
},
)
attn_implementation: str = field(
default="eager",
metadata={"help": "Attention implementation used. One of `eager`, `sdpa`, `flash_attention_2`"},
)
cross_attention_implementation_strategy: str = field(
default=None,
metadata={
"help": "If not specified, the cross-attention implementation will be the same as `_attn_implementation`. If `always_eager`, it will always be the eager implementation. If `always_sdpa`, it will always be the sdpa implementation."
},
)
prompt_padding_side: Optional[str] = field(
default="left",
metadata={
"help": "Prompt tokenizer padding side. Defaults to `left`. If the prompt is pre-pended to the codebooks hidden states, it should be padded on the left."
},
)
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    Every field has a default, so the class can be instantiated with no arguments. Fields
    whose default is `None` are annotated `Optional[...]` so that the annotation matches the
    value the field holds when it is left unset.
    """

    # --- Training dataset selection ---
    train_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
        },
    )
    train_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset configs by a '+' symbol."
        },
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
        },
    )
    train_dataset_samples: Optional[str] = field(
        default=None,
        metadata={
            "help": "Number of samples in the training data. Load and combine "
            "multiple datasets by separating dataset samples by a '+' symbol."
        },
    )
    train_metadata_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_metadata_dataset_name='librispeech_asr+common_voice'`."
        },
    )

    # --- Evaluation dataset selection ---
    eval_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
        },
    )
    eval_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    eval_metadata_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the metadata evaluation dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `eval_metadata_dataset_name='librispeech_asr+common_voice'`."
        },
    )

    # --- Dataset column names ---
    target_audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
    )
    description_column_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
    )
    prompt_column_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
    )

    # --- Preprocessing, truncation and filtering ---
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    max_duration_in_seconds: float = field(
        default=35.0,
        metadata={
            "help": (
                "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`. "
                "Also, used to set maximum audio length if `pad_to_max_length=True`."
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    max_text_length: int = field(
        default=500, metadata={"help": "If set, max description lengths in number of characters."}
    )
    max_prompt_token_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens. "
                "Also, used to set maximum prompt token length if `pad_to_max_length=True`."
            )
        },
    )
    max_description_token_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens. "
                "Also, used to set maximum description token length if `pad_to_max_length=True`."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "If `True`, pad audio, prompt and description to a maximum length set with respectively "
                "`max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`."
            )
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training."
                " In this training script, `save_to_disk` must be set to the path in which the dataset should be saved. "
            )
        },
    )

    # --- Hub authentication and remote code ---
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    # NOTE(review): intentionally left as plain `bool` despite the `None` default.
    # `HfArgumentParser` appears to special-case `bool` differently from `Optional[bool]`
    # (bare flag vs. explicit value), so changing the annotation could alter the CLI — confirm
    # before touching it.
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )

    # --- Logging (wandb) ---
    add_audio_samples_to_wandb: bool = field(
        default=False,
        metadata={"help": "If set and if `wandb` in args.report_to, will add generated audio samples to wandb logs."},
    )
    id_column_name: Optional[str] = field(default=None, metadata={"help": "id column name."})
    wandb_project: str = field(
        default="parler-speech",
        metadata={"help": "The name of the wandb project."},
    )
    wandb_run_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "If specified, the name of the run. If not specified, wandb will give a random name to this run."
        },
    )

    # --- On-disk dataset caching and padding ---
    save_to_disk: Optional[str] = field(
        default=None,
        metadata={
            "help": "If set, will save the dataset to this path if this is an empty folder. If not empty, will load the datasets from it."
        },
    )
    temporary_save_to_disk: Optional[str] = field(
        default=None, metadata={"help": "Temporarily save audio labels here."}
    )
    save_codec_steps: Optional[int] = field(
        default=500,
        metadata={"help": "Temporarily save the audio labels every `save_steps`."},
    )
    pad_to_multiple_of: Optional[int] = field(
        default=2,
        metadata={"help": ("Pad to multiple of for tokenizers.")},
    )
@dataclass
class ParlerTTSTrainingArguments(Seq2SeqTrainingArguments):
    """
    Training arguments that extend `Seq2SeqTrainingArguments` with the extra fields declared below.

    Every field added here has a default, so none of them is required on the command line.
    """

    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": (
                "The data type (dtype) in which to run training. One of `float32` (full-precision), "
                "`float16` or `bfloat16` (both half-precision)."
            )
        },
    )
    audio_encoder_per_device_batch_size: int = field(
        default=8,
        metadata={"help": ("Specify the batch size of the audio encoding pre-processing steps.")},
    )
    eval_dataloader_num_workers: Optional[int] = field(
        default=0,
        metadata={
            "help": (
                "Number of subprocesses to use for evaluation data loading (PyTorch only). 0 means that the data will be loaded in the main process."
            )
        },
    )

    # --- Evaluation metrics ---
    compute_clap_similarity_metric: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether or not to compute the clap similarity metric between the description and the generation during evaluation."
            )
        },
    )
    compute_noise_level_metric: bool = field(
        default=True,
        metadata={"help": ("Whether or not to compute the squim si-sdr measure of the generations.")},
    )
    noise_level_to_compute_clean_wer: float = field(
        default=25,
        metadata={
            "help": (
                # Trailing space keeps the two adjacent literals from running together.
                "if `compute_noise_level_metric=True`, will compute a 'clean' WER on samples with generated noise higher than `noise_level_to_compute_clean_wer`. "
                "This is a proxy measure to compute WER on clean audios, provided that the model learn to generate clean audios."
            )
        },
    )
    eval_generation_steps: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                # Trailing space keeps the two adjacent literals from running together.
                "Number of update steps between two generation evaluation. Will default to the same "
                "value as `eval_steps` if not set. Should be an integer and a multiple of `eval_steps`."
            )
        },
    )
    codebook_weights: Optional[List[float]] = field(
        default=None,
        metadata={"help": "Weights applied to each codebook."},
    )