-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathutils.py
52 lines (42 loc) · 1.59 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from collections import namedtuple, Counter
from math import log
import os
import gzip
import numpy as np
# Root prefix for every resource path below. "./" makes the paths relative
# to the current working directory of the running process.
RT = "./"

# Brown word-cluster tables read by the readMetaOptimizeBrownCluster* loaders.
# NOTE(review): judging by the filenames these are presumably the 100-cluster
# and 1000-cluster variants -- confirm.
BROWNCLUSFILE_100 = (
    RT
    + "model/resources/MetaOptimize/BrownWC/brown-rcv1.clean.tokenized-CoNLL03.txt-c100-freq1.txt"
)
# NOTE(review): "bronwn" looks like a typo for "brown"; verify against the
# actual filename on disk before changing it.
BROWNCLUSFILE = (
    RT
    + "model/resources/MetaOptimize/BrownWC/bronwn-rcv1-1000-freq1.txt"
)

# Stopword list read by readStopwords(), one entry per line.
STOPWORDFILE = (
    RT
    + "model/resources/nltkstopwords.txt"
)
def _read_brown_clusters(path):
    """Parse a Brown-cluster table and return its lookup dictionaries.

    Each non-blank line of the file must hold exactly three tab-separated
    fields, unpacked as ``bitstr``, ``word`` and an occurrence count. The
    count is read but not used.

    Args:
        path: Location of the UTF-8 encoded cluster file.

    Returns:
        A ``(word_cluster_d, cluster_2_index)`` tuple. ``word_cluster_d``
        maps each word to its bit string (a later line for the same word
        overwrites an earlier one). ``cluster_2_index`` maps each distinct
        bit string to a 0-based index in order of first appearance.

    Raises:
        ValueError: If a non-blank line does not split into three fields.
        OSError: If the file cannot be opened.
    """
    print ("loading brown clusters...")
    word_cluster_d = {}
    cluster_2_index = {}
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing empty line) carry no data;
                # skip them instead of failing on the unpack below.
                continue
            bitstr, word, _numocc = record.split("\t")
            word_cluster_d[word] = bitstr
            # The default is evaluated before insertion, so a new bit string
            # receives the next free index and a known one is left untouched.
            cluster_2_index.setdefault(bitstr, len(cluster_2_index))
    print ("done; # words: ,", len(word_cluster_d))
    return word_cluster_d, cluster_2_index


def readMetaOptimizeBrownCluster_100():
    """Load the cluster table stored at ``BROWNCLUSFILE_100``.

    Returns:
        ``(word_cluster_d, cluster_2_index)``: word -> bit string, and
        bit string -> index in order of first appearance.
    """
    return _read_brown_clusters(BROWNCLUSFILE_100)


def readMetaOptimizeBrownCluster():
    """Load the cluster table stored at ``BROWNCLUSFILE``.

    Returns:
        ``(word_cluster_d, cluster_2_index)``: word -> bit string, and
        bit string -> index in order of first appearance.
    """
    return _read_brown_clusters(BROWNCLUSFILE)
def readStopwords():
    """Load the stopword list stored at ``STOPWORDFILE``.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored.

    Returns:
        A set containing the distinct stripped lines of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8, like the Brown-cluster loaders, so the
    # result does not depend on the platform's default locale encoding.
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(STOPWORDFILE, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return {entry for entry in stripped_lines if entry}