-
Notifications
You must be signed in to change notification settings - Fork 1
/
add_vocabs.py
53 lines (43 loc) · 2.02 KB
/
add_vocabs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import sys
def _read_vocab_lines(path):
    """Return the raw lines of the vocab file at *path* that have two fields.

    Lines whose whitespace-split length is not exactly 2 (including blank
    lines) are dropped. Kept lines are returned unmodified, trailing newline
    included.
    """
    # Vocab files hold multilingual text; do not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as handle:
        return [line for line in handle if len(line.split()) == 2]


def _extend_vocab(hmr_lines, lmr_lines):
    """Merge two vocab line lists and return ``(final_lines, n_new)``.

    *hmr_lines* and *lmr_lines* are "<word> <value>" lines. The result holds
    every hmr line with its integer value shifted by an offset, followed by
    the lmr lines whose word does not occur in the hmr vocab, unchanged and
    in their original order. The offset is the value of the first new lmr
    line (presumably the largest when the file is sorted by that value -
    TODO confirm against the vocab producer); it is 0 when there is nothing
    new to add.

    Raises ValueError if a value that has to be shifted is not an integer.
    """
    # A set keeps the membership test O(1) per lmr word.
    hmr_words = {line.split()[0] for line in hmr_lines}
    new_items = [line for line in lmr_lines
                 if line.split()[0] not in hmr_words]

    # With no new items there is no first value to read; leave hmr values
    # untouched instead of failing with an IndexError.
    offset = int(new_items[0].split()[1]) if new_items else 0

    final_vocab = []
    for line in hmr_lines:
        word, value = line.split()
        final_vocab.append("{} {}\n".format(word, int(value) + offset))
    final_vocab.extend(new_items)
    return final_vocab, len(new_items)


def main(argv=None):
    """Extend the hmr vocab with the unseen lmr words and write the result.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the hmr language code and
    ``argv[2]`` the lmr language code. Reads
    ``./data/<hmr>-wmt/vocab.<hmr>`` and ``./data/<hmr>-<lmr>-wmt/vocab.<lmr>``
    and writes ``vocab.<lmr>-<hmr>-ext-by-<n>`` into the latter directory,
    where ``<n>`` is the number of lmr lines that were added.

    Prints both language codes, then writes the output file name to stdout
    without a trailing newline. Exits with a usage message when fewer than
    two language codes are given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit('usage: add_vocabs.py <hmr_lang> <lmr_lang>')
    hmr_lang, lmr_lang = argv[1], argv[2]
    print(hmr_lang, lmr_lang)

    mono_path = './data/{}-wmt/'.format(hmr_lang)
    proc_path = './data/{}-{}-wmt/'.format(hmr_lang, lmr_lang)

    hmr_lines = _read_vocab_lines(mono_path + 'vocab.' + hmr_lang)
    lmr_lines = _read_vocab_lines(proc_path + 'vocab.' + lmr_lang)
    final_vocab, added = _extend_vocab(hmr_lines, lmr_lines)

    out_name = 'vocab.{}-{}-ext-by-{}'.format(lmr_lang, hmr_lang, added)
    with open(proc_path + out_name, 'w', encoding='utf-8') as out:
        out.writelines(final_vocab)
    # No trailing newline: the bare file name is the script's last output.
    sys.stdout.write(out_name)


if __name__ == '__main__':
    main()