forked from brannondorsey/GloVe-experiments
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
50 lines (41 loc) · 1.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import codecs, json
import numpy as np
'''Serializable/Pickleable class to replicate the functionality of collections.defaultdict'''
class autovivify_list(dict):
def __missing__(self, key):
value = self[key] = []
return value
def __add__(self, x):
'''Override addition for numeric types when self is empty'''
if not self and isinstance(x, Number):
return x
raise ValueError
def __sub__(self, x):
'''Also provide subtraction method'''
if not self and isinstance(x, Number):
return -1 * x
raise ValueError
def build_word_vector_matrix(vector_file, n_words):
'''Read a GloVe array from sys.argv[1] and return its vectors and labels as arrays'''
np_arrays = []
labels_array = []
with codecs.open(vector_file, 'r', 'utf-8') as f:
for i, line in enumerate(f):
sr = line.split()
labels_array.append(sr[0])
np_arrays.append(np.array([float(j) for j in sr[1:]]))
if i == n_words - 1:
return np.array(np_arrays), labels_array
def get_cache_filename_from_args(args):
a = (args.vector_dim, args.num_words, args.num_clusters)
return '{}D_{}-words_{}-clusters.json'.format(*a)
def get_label_dictionaries(labels_array):
id_to_word = dict(zip(range(len(labels_array)), labels_array))
word_to_id = dict((v,k) for k,v in id_to_word.items())
return word_to_id, id_to_word
def save_json(filename, results):
with open(filename, 'w') as f:
json.dump(results, f)
def load_json(filename):
with open(filename, 'r') as f:
return json.load(f)