-
Notifications
You must be signed in to change notification settings - Fork 3
/
hownet_corpus_data_picker.py
35 lines (34 loc) · 1.18 KB
/
hownet_corpus_data_picker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
import sys
# Pick, from an embedding file, the vectors of the words listed in a HowNet
# file and write them to a new embedding file.
#
# Usage: hownet_corpus_data_picker.py HOWNET_FILE EMBEDDING_FILE TARGET_FILE
#
# All files are processed as raw bytes: tokens are only compared and copied,
# never decoded, so the result does not depend on the interpreter's default
# encoding and the code behaves the same on Python 2 and Python 3.


def parse_dim_size(header_line):
    """Return the vector dimension stored in the embedding header line.

    The header is expected to hold at least two whitespace-separated fields;
    the second one is the dimension (the first is ignored here because the
    output count is recomputed from the selected rows).

    Raises:
        ValueError: if the header has fewer than two fields or the second
            field is not an integer.
    """
    fields = header_line.split()
    if len(fields) < 2:
        raise ValueError(
            "embedding file must start with a '<count> <dim>' header line")
    return int(fields[1])


def select_embedding_lines(hownet_lines, embedding_lines):
    """Return the embedding rows whose word appears in the HowNet listing.

    Args:
        hownet_lines: lines of the HowNet file as bytes. Only every second
            line, starting with the first, is treated as a word; the lines
            in between are skipped.
        embedding_lines: lines of the embedding file as bytes. The first
            line is the header and is never selected; on every other line
            the first whitespace-separated token is the word.

    Returns:
        A list of embedding lines in the order their words occur in
        ``hownet_lines``. A word listed several times is emitted several
        times. Every returned line ends with a newline.
    """
    row_of_word = {}
    rows = enumerate(embedding_lines)
    next(rows, None)  # skip the header line
    for row, raw_line in rows:
        fields = raw_line.split()
        if fields:  # tolerate blank lines instead of failing on fields[0]
            # When a word occurs more than once the last row wins.
            row_of_word[fields[0]] = row

    selected = []
    for entry in hownet_lines[::2]:
        row = row_of_word.get(entry.strip())
        if row is None:
            continue
        line = embedding_lines[row]
        # The last line of a file may lack its newline; add it so the record
        # does not run into the one written after it.
        if not line.endswith(b"\n"):
            line += b"\n"
        selected.append(line)
    return selected


def main(argv=None):
    """Run the picker; return the process exit status.

    Args:
        argv: argument vector laid out like ``sys.argv`` (program name,
            HowNet file, embedding file, target file). Defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when fewer than three file arguments were given.

    Raises:
        ValueError: if the embedding file is empty or its header is invalid.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.stderr.write('no enough parameter\n')
        return 1
    hownet_filename, embedding_filename, target_filename = argv[1:4]

    with open(hownet_filename, 'rb') as hownet:
        hownet_lines = hownet.readlines()
    with open(embedding_filename, 'rb') as embedding:
        embedding_lines = embedding.readlines()

    if not embedding_lines:
        raise ValueError("embedding file is empty: %s" % embedding_filename)
    dim_size = parse_dim_size(embedding_lines[0])
    selected = select_embedding_lines(hownet_lines, embedding_lines)

    # Open the target only after the inputs were read and validated, so a bad
    # input cannot leave a truncated output file behind.
    header = ("%d %d\n" % (len(selected), dim_size)).encode("ascii")
    with open(target_filename, 'wb') as target:
        target.write(header)
        target.writelines(selected)
    return 0


if __name__ == "__main__":
    sys.exit(main())