train_tokenizer.py
"""Train a tokenizer on a text corpus, with optional sampling and
morpheme-aware (Mecab) preprocessing, driven by a YAML config."""
import argparse
import glob
import logging
import os

# configs
from tools.config import cfg_from_yaml_file
from tokenizers import normalizers
# utils
from tools.utils import (
    multiprocessing_with_async,
    preprocess_mecab_pool, preprocess_shuf_pool,
    preprocess_mecab_pool_line,
)

log = logging.getLogger(__name__)
def sampling(data_path: str, sample_rate: float, save_path: str = '/samples/'):
    """Shuffle-sample each file matched by `data_path` into a sibling `save_path` directory."""
    files = glob.glob(str(data_path))
    params = {
        'inputs': files,
        'targets': ["/".join([os.path.dirname(i), save_path, os.path.basename(i)]) for i in files],
        'sample_rate': sample_rate,
    }
    log.debug(params)
    res = multiprocessing_with_async(params, func=preprocess_shuf_pool)
    return res, str(data_path) + str(save_path)
def morphme(data_path: str, save_path: str = '/mecab/'):
    """Run Mecab morpheme analysis over every .txt file under `data_path`, one file per worker."""
    files = glob.glob(str(data_path) + '/*.txt')
    params = {
        'inputs': files,
        'targets': ["/".join([os.path.dirname(i), save_path, os.path.basename(i)]) for i in files],
    }
    log.debug(params)
    res = multiprocessing_with_async(params, func=preprocess_mecab_pool)
    return res, str(data_path) + str(save_path)
def morphme_lines(data_path: str, save_path: str = '/mecab/'):
    """Line-level variant of morphme(): each line is keyed as "file_idx-line_idx",
    analyzed in parallel, then regrouped and written back per file."""
    files = glob.glob(str(data_path) + '/*.txt')
    file_lines = []
    input_files = []
    # index every line of every file as "file_idx-line_idx"
    for file_idx, _file in enumerate(files):
        with open(_file, "r") as read_from:
            lines = read_from.read().split('\n')
        input_files.append(lines)
        for line_idx in range(len(lines)):
            file_lines.append("{}-{}".format(file_idx, line_idx))
    params = {'inputs': file_lines, 'files': input_files}
    log.debug(params)
    res = multiprocessing_with_async(params, func=preprocess_mecab_pool_line)
    # regroup the per-line results and write one output file per input file
    for file_idx, _file in enumerate(files):
        write_file = "/".join([os.path.dirname(_file), save_path, os.path.basename(_file)])
        with open(write_file, "w") as write_from:
            for _line, values in res[file_idx].items():
                write_from.write("\n".join(values))
    return res, str(data_path) + str(save_path)
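
# Assumed result shape for preprocess_mecab_pool_line (its source is not shown
# here): res[file_idx] is taken to map each line key to its analyzed strings,
# e.g. res[0] == {"0-0": ["안녕 하 세요"], "0-1": [...], ...}, which is what the
# regrouping loop above relies on.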
def main(cfg):
    config = cfg_from_yaml_file(cfg)
    # Sampling
    log.info('start sampling')
    _, path = sampling(data_path=config['Path']['data-path'],
                       sample_rate=config['Samples']['rate'],
                       save_path='/samples/')
    # Morpheme analysis (optional)
    if config['Morpheme-aware']:
        log.info('start morpheme')
        _, save_path = morphme(data_path=config['Path']['save-path'], save_path='/mecab/')
    else:
        save_path = config['Path']['save-path']
    texts = glob.glob(save_path + '*.txt')
    log.debug(texts)
    # assemble the tokenizer pipeline from the config
    log.info('set tokenizer')
    tokenizer = config['Pipelines']['Tokenizer']
    tokenizer.pre_tokenizer = config['Pipelines']['pre_tokenizer']
    tokenizer.normalizer = normalizers.Sequence(config['Pipelines']['normalizer'])
    tokenizer.decoder = config['Pipelines']['decoder']
    # train
    log.info('train tokenizer')
    tokenizer.train(texts, show_progress=True)
    # quick sanity check on a Korean greeting
    log.info('eval tokenizer')
    log.info(tokenizer.encode("안녕하세요").tokens)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cfg",
        default=None,
        metavar="path",
        type=str,
        required=True,
        help="path to the YAML config file",
    )
    args = parser.parse_args()
    main(args.cfg)
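
# Example invocation and config sketch. This is an assumption: the exact YAML
# schema is defined by tools.config.cfg_from_yaml_file, which is not shown in
# this file, so the keys below are inferred from the lookups in main().
#
#   python train_tokenizer.py --cfg configs/tokenizer.yaml
#
#   # configs/tokenizer.yaml (hypothetical)
#   Path:
#     data-path: /data/corpus/*.txt     # glob pattern consumed by sampling()
#     save-path: /data/corpus/samples/  # directory read back for training
#   Samples:
#     rate: 0.1                         # sample_rate passed to preprocess_shuf_pool
#   Morpheme-aware: true                # run Mecab analysis before training
#   Pipelines:                          # objects built by cfg_from_yaml_file
#     Tokenizer: ...                    # a tokenizers.Tokenizer instance
#     pre_tokenizer: ...
#     normalizer: [...]                 # list, wrapped in normalizers.Sequence
#     decoder: ...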