-
Notifications
You must be signed in to change notification settings - Fork 0
/
opts.py
86 lines (73 loc) · 4.14 KB
/
opts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import argparse
def preprocess_opts(parser):
    """Register the preprocessing command-line options on ``parser``.

    The options are attached in six argument groups, created in this
    order: Data, Vocab, Pruning, Random, Logging and Speech.  ``parser``
    is modified in place and nothing is returned.

    Args:
        parser: An object providing ``add_argument_group`` (for example
            an ``argparse.ArgumentParser``).
    """
    # --- Data: corpus locations and where to write the result ---------
    data = parser.add_argument_group('Data')
    data.add_argument(
        '-data_type',
        default="text",
        help="Type of the source input.\n"
             "Options are [text|visual|audio].",
    )
    data.add_argument(
        '-train_src', required=True,
        help="Path to the training source data",
    )
    data.add_argument(
        '-train_tgt', required=True,
        help="Path to the training target data",
    )
    data.add_argument(
        '-valid_src', required=True,
        help="Path to the validation source data",
    )
    data.add_argument(
        '-valid_tgt', required=True,
        help="Path to the validation target data",
    )
    data.add_argument(
        '-src_dir', default="",
        help="Source directory for image or audio files.",
    )
    data.add_argument(
        '-save_data', required=True,
        help="Output file for the prepared data",
    )
    data.add_argument(
        '-max_shard_size', type=int, default=0,
        help="For text corpus of large volume, it will\n"
             "be divided into shards of this size to preprocess.\n"
             "If 0, the data will be handled as a whole. The unit\n"
             "is in bytes. Optimal value should be multiples of\n"
             "64 bytes.",
    )

    # --- Vocab: dictionary settings (text corpus) ---------------------
    vocab = parser.add_argument_group('Vocab')
    vocab.add_argument(
        '-src_vocab',
        help="Path to an existing source vocabulary",
    )
    vocab.add_argument(
        '-tgt_vocab',
        help="Path to an existing target vocabulary",
    )
    vocab.add_argument(
        '-features_vocabs_prefix', type=str, default='',
        help="Path prefix to existing features vocabularies",
    )
    vocab.add_argument(
        '-src_vocab_size', type=int, default=50000,
        help="Size of the source vocabulary",
    )
    vocab.add_argument(
        '-tgt_vocab_size', type=int, default=50000,
        help="Size of the target vocabulary",
    )
    # The two frequency options are registered without help text.
    vocab.add_argument('-src_words_min_frequency', type=int, default=0)
    vocab.add_argument('-tgt_words_min_frequency', type=int, default=0)
    vocab.add_argument(
        '-dynamic_dict', action='store_true',
        help="Create dynamic dictionaries",
    )
    vocab.add_argument(
        '-share_vocab', action='store_true',
        help="Share source and target vocabulary",
    )

    # --- Pruning: length limits and casing (text corpus) --------------
    pruning = parser.add_argument_group('Pruning')
    pruning.add_argument(
        '-src_seq_length', type=int, default=50,
        help="Maximum source sequence length",
    )
    pruning.add_argument(
        '-src_seq_length_trunc', type=int, default=0,
        help="Truncate source sequence length.",
    )
    pruning.add_argument(
        '-tgt_seq_length', type=int, default=50,
        help="Maximum target sequence length to keep.",
    )
    pruning.add_argument(
        '-tgt_seq_length_trunc', type=int, default=0,
        help="Truncate target sequence length.",
    )
    pruning.add_argument(
        '-lower', action='store_true',
        help='lowercase data',
    )

    # --- Random: shuffling and seeding --------------------------------
    random_opts = parser.add_argument_group('Random')
    random_opts.add_argument(
        '-shuffle', type=int, default=1,
        help="Shuffle data",
    )
    random_opts.add_argument(
        '-seed', type=int, default=3435,
        help="Random seed",
    )

    # --- Logging ------------------------------------------------------
    logging_opts = parser.add_argument_group('Logging')
    logging_opts.add_argument(
        '-report_every', type=int, default=100000,
        help="Report status every this many sentences",
    )

    # --- Speech: spectrogram parameters -------------------------------
    speech = parser.add_argument_group('Speech')
    speech.add_argument(
        '-sample_rate', type=int, default=16000,
        help="Sample rate.",
    )
    speech.add_argument(
        '-window_size', type=float, default=.02,
        help="Window size for spectrogram in seconds.",
    )
    speech.add_argument(
        '-window_stride', type=float, default=.01,
        help="Window stride for spectrogram in seconds.",
    )
    speech.add_argument(
        '-window', default='hamming',
        help="Window type for spectrogram generation.",
    )