forked from wewanna/appropriate-filetering
-
Notifications
You must be signed in to change notification settings - Fork 10
/
word_to_vec.py
43 lines (29 loc) · 832 Bytes
/
word_to_vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from Hangulpy import decompose, is_hangul
import csv
import argparse
# Command-line interface: a CSV path and an integer dimension.
parser = argparse.ArgumentParser()
parser.add_argument('input', help='path to the CSV file to read')
# The 'output' positional is currently disabled:
# parser.add_argument('output')
# type=int lets argparse reject a non-integer D with a usage error instead of
# an uncaught ValueError traceback from a later int() call.
parser.add_argument('D', type=int, help='dimension (integer)')
args = parser.parse_args()
dimension = args.D
print('dimension: {}'.format(dimension))

# Maps each jamo to an integer index; filled in by add_to_dict.
jamo_dictionary = dict()
def decompose_string(text):
    """Return the decomposition of every Hangul character in ``text``.

    Characters for which ``is_hangul`` is false are skipped; the remaining
    ones are passed through ``decompose`` and collected in a list, in their
    original order.
    """
    return [decompose(ch) for ch in text if is_hangul(ch)]
def add_to_dict(composed, vocab=None):
    """Give every not-yet-seen jamo the next free integer index.

    Args:
        composed: Iterable of decomposed characters, each of which is itself
            an iterable of jamo.
        vocab: Mapping to update in place. When omitted (``None``), the
            module-level ``jamo_dictionary`` is updated, which is what the
            function always did before this parameter existed.

    Returns:
        None. The mapping is modified in place.
    """
    # Compare against None explicitly so that an empty dict passed by the
    # caller is used rather than replaced by the global.
    target = jamo_dictionary if vocab is None else vocab
    for jamos in composed:
        for jamo in jamos:
            # len(target) is evaluated before any insertion, so a new jamo
            # receives the next consecutive index and known ones are untouched.
            target.setdefault(jamo, len(target))
# Build the jamo vocabulary from the second column of every CSV row.
# newline='' is what the csv module documents for files handed to csv.reader,
# so that newlines embedded in quoted fields are parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used; confirm
# that it matches the input file.
with open(args.input, 'r', newline='') as f:
    for row in csv.reader(f):
        # Blank lines come back as [] and short rows have no second column;
        # skip them rather than fail with IndexError.
        if len(row) < 2:
            continue
        add_to_dict(decompose_string(row[1]))

print(len(jamo_dictionary))
print(jamo_dictionary)