forked from gooofy/kaldi-adapt-lm
-
Notifications
You must be signed in to change notification settings - Fork 2
/
replace.py
45 lines (39 loc) · 1.55 KB
/
replace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
import argparse
# Help text for -c/--char-set (argparse re-wraps the whitespace when printing).
_CHAR_SET_HELP = """character sets
1: [a-zA-Z0-9\u00c0-\u00ff.<>'_-],
2: optimized for English,
3: optimized for German
"""

# For each supported character set: a pattern matching every character that
# is NOT kept.  Compiled once here instead of being looked up per input line.
_STRIP_PATTERNS = {
    # keep a-z A-Z 0-9 À-ÿ . < > ' _ -
    1: re.compile(r"[^a-zA-Z0-9\u00c0-\u00ff.<>'_-]"),
    # English: keep ASCII letters and the apostrophe
    2: re.compile(r"[^a-zA-Z']"),
    # German: keep ASCII letters, umlauts and ß
    3: re.compile(r"[^a-zA-ZÄÖÜäöüß]"),
}


def replace_special_chars(line, char_set=1, lower=False):
    """Return *line* with every character outside *char_set* turned into a space.

    Args:
        line: Text to clean.  A trailing newline is not special-cased: it is
            outside every character set and therefore becomes a space too.
        char_set: 1, 2 or 3 (see ``_STRIP_PATTERNS``).
        lower: When true, lower-case the result after the replacement.

    Returns:
        The cleaned string; it has the same length as *line* except for
        character set 3, where apostrophes are removed.

    Raises:
        ValueError: If *char_set* is not one of the supported sets.
    """
    pattern = _STRIP_PATTERNS.get(char_set)
    if pattern is None:
        raise ValueError(f"unsupported character set: {char_set!r}")
    if char_set == 3:
        # Apostrophes are deleted (not spaced out) so that contractions such
        # as "geht's" stay a single token: "gehts".
        line = line.replace("'", "")
    line = pattern.sub(' ', line)
    return line.lower() if lower else line


def parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(
        description="Replace special characters line-by-line in a file with space and print or store result.")
    parser.add_argument("-f", "--file", default=None, help="input file", required=True)
    parser.add_argument("-o", "--out", default=None, help="output file to store result")
    # type=int makes argparse report a non-numeric value as a usage error
    # instead of letting int() raise a ValueError traceback later on.
    parser.add_argument("-c", "--char-set", default=1, type=int, help=_CHAR_SET_HELP)
    parser.add_argument("-l", "--lower", action='store_true', help="make everything lower-case")
    return parser.parse_args(argv)


def main(argv=None):
    """Clean the input file and print the result or store it in ``--out``.

    Raises:
        SystemExit: If the requested character set is not supported.
    """
    args = parse_args(argv)
    char_set = args.char_set
    if char_set not in _STRIP_PATTERNS:
        raise SystemExit(f'-c {char_set} NOT YET SUPPORTED')

    # Explicit UTF-8: the character sets contain non-ASCII letters, so the
    # result must not depend on the locale's default encoding.
    with open(args.file, "r", encoding="utf-8") as fp:
        cleaned = (replace_special_chars(line, char_set, args.lower) for line in fp)
        if args.out is None:
            # No output file requested: stream straight to stdout.
            for line in cleaned:
                print(line)
            return
        # Read everything before the output file is opened, so that passing
        # the same path for --file and --out does not truncate the input.
        buffered = list(cleaned)

    with open(args.out, 'w', encoding="utf-8") as fp:
        fp.writelines(f"{line}\n" for line in buffered)


if __name__ == "__main__":
    main()