-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrand_vecs.py
161 lines (117 loc) · 4.25 KB
/
rand_vecs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from preprocessing import process_embedding
from preprocessing import check_valid_file
from preprocessing import check_valid_dir
from next_batch import next_batch
import tensorflow.contrib.layers as lays
import multiprocessing as mp
import tensorflow as tf
import pandas as pd
import numpy as np
from progressbar import progressbar
from tqdm import tqdm
import datetime
import pyemblib
import scipy
import queue
import time
import sys
import os
'''
rand_vecs.py
Script to generate an embedding with random, normalized vectors for each
token from a source vocab file.
'''
#========1=========2=========3=========4=========5=========6=========7==
# RETURNS: a list of the script arguments: [emb_path, emb_format, first_n]
def parse_args():
    """Read the script's positional command-line arguments from ``sys.argv``.

    Expected invocation::

        rand_vecs.py <emb_path> <emb_format> [first_n]

    Returns:
        list: ``[emb_path, emb_format, first_n]``. ``emb_path`` and
        ``emb_format`` are the raw argument strings; ``first_n`` is an
        ``int`` and is 0 when the third argument is omitted.

    Raises:
        IndexError: if fewer than two positional arguments were given.
        ValueError: if the optional third argument is not an integer.
    """
    emb_path = sys.argv[1]
    emb_format = sys.argv[2]  # 'Word2Vec' or 'Glove'

    # Convert to int so the value has the same type whether it was given on
    # the command line or defaulted (it used to be a str in the former case).
    first_n = int(sys.argv[3]) if len(sys.argv) > 3 else 0

    return [emb_path, emb_format, first_n]
#========1=========2=========3=========4=========5=========6=========7==
# VECTOR GENERATION FUNCTION
def epoch(vectors_matrix,
          labels_df,
          new_emb_path):
    """Build a random, L2-normalized vector per label and save the embedding.

    One random vector is drawn for every row of ``vectors_matrix`` (only the
    number of rows and the length of the first row are used; the values of
    the source vectors are ignored). Each vector is scaled to unit Euclidean
    norm, paired with the label at the same position, and the resulting
    mapping is written with ``pyemblib.write`` in binary mode.

    Args:
        vectors_matrix: sequence of source vectors; must support ``len()``
            and integer indexing of its first row.
        labels_df: label container; must expose ``.shape`` and ``len()``
            and return the i-th label for ``labels_df[i]``.
        new_emb_path: destination path (str) of the embedding file.

    Raises:
        IndexError: if there are more labels than rows in ``vectors_matrix``.
    """
    name = mp.current_process().name
    print(name, 'Starting')
    sys.stdout.flush()

    num_vectors = len(vectors_matrix)
    # Guard the row lookup so an empty matrix does not raise.
    dimensions = len(vectors_matrix[0]) if num_vectors else 0

    # shape [<num_inputs>,<dimensions>]
    # Draw the whole matrix in one call instead of one Python-level call per
    # row; entries are uniform in [0, 1), then each row is scaled to norm 1.
    rand_emb_array = np.random.rand(num_vectors, dimensions)
    rand_emb_array /= np.linalg.norm(rand_emb_array, axis=1, keepdims=True)

    print("labels shape: ", labels_df.shape)

    # creates the emb dict
    dist_emb_dict = {}
    for i in tqdm(range(len(labels_df))):
        dist_emb_dict[labels_df[i]] = rand_emb_array[i]

    # saves the embedding
    pyemblib.write(dist_emb_dict,
                   new_emb_path,
                   mode=pyemblib.Mode.Binary)

    print("Embedding saved to: " + new_emb_path)
    print(name, 'Exiting')
    return
#=========1=========2=========3=========4=========5=========6=========7=
def mkproc(func, arguments):
    """Start ``func`` in a new process and return the process handle.

    Args:
        func: callable used as the process target.
        arguments: tuple of positional arguments passed to ``func``.

    Returns:
        The already-started ``multiprocessing.Process``. This function does
        not wait for it to finish; joining is left to the caller.
    """
    worker = mp.Process(target=func, args=arguments)
    worker.start()
    return worker
#========1=========2=========3=========4=========5=========6=========7==
def genflow(emb_path, emb_format, first_n):
    """Load a source embedding and write a random embedding for its labels.

    The source file is validated and loaded through ``process_embedding``.
    A worker process then runs ``epoch`` to generate the random vectors and
    save them next to the source file as
    ``random__source--<source name>__<YYYY-mm-dd-HHMM>.bin``.

    Args:
        emb_path: path to the source embedding file.
        emb_format: format identifier forwarded to ``process_embedding``.
        first_n: how many leading vectors of the source to use; 0 (or any
            falsy value) requests the entire embedding. Accepts an int or
            a numeric string.

    Raises:
        ValueError: if ``first_n`` cannot be converted to an integer.
        RuntimeError: if the worker process exits with a non-zero code.
    """
    print_sleep_interval = 1
    print("checkpoint 1")

    check_valid_file(emb_path)

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)

    # take the first n most frequent word vectors for a subset
    # 0 takes the entire embedding
    # The caller's value is honoured here (it used to be overwritten with 0)
    # and coerced to int because it may arrive as a command-line string.
    first_n = int(first_n) if first_n else 0

    # Preprocess.
    # NOTE(review): the meaning of the trailing None argument is defined by
    # process_embedding, which is not visible here.
    vectors_matrix, label_df = process_embedding(emb_path,
                                                 emb_format,
                                                 first_n,
                                                 None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    #===================================================================

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")

    # the name of the embedding to save; it is placed in the directory that
    # contains the source embedding
    parent = os.path.dirname(os.path.abspath(emb_path))
    check_valid_dir(parent)
    new_emb_path = os.path.join(parent, "random__source--" + source_name
                                + "__" + timestamp + ".bin")
    print("Writing to: ", new_emb_path)

    # Run the vector generation in a separate process and wait for it.
    eval_process = mp.Process(name="eval",
                              target=epoch,
                              args=(vectors_matrix,
                                    label_df,
                                    new_emb_path))
    eval_process.start()
    eval_process.join()

    # Surface a failed worker instead of returning as if the file was written.
    if eval_process.exitcode != 0:
        raise RuntimeError("eval process exited with code "
                           + str(eval_process.exitcode))
    return
#========1=========2=========3=========4=========5=========6=========7==
if __name__ == "__main__":
    # Script entry point: skipped when this module is imported.
    args = parse_args()
    # parse_args yields [emb_path, emb_format, first_n]
    emb_path, emb_format, first_n = args
    genflow(emb_path, emb_format, first_n)