-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransform.py
157 lines (116 loc) · 4.34 KB
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from preprocessing import process_embedding
from preprocessing import check_valid_file
from preprocessing import check_valid_dir
from config import get_config
import multiprocessing as mp
import pandas as pd
import numpy as np
from progressbar import progressbar
from tqdm import tqdm
import datetime
import pyemblib
import scipy
import queue
import time
import sys
import os
'''
transform.py
Script to generate a sequence of affine transformations of a set of
pretrained word embeddings.
'''
#========1=========2=========3=========4=========5=========6=========7==
# RETURNS: a list of the script arguments: [emb_path, emb_format, first_n]
def parse_args():
    """Read the script's positional command-line arguments from ``sys.argv``.

    Expected invocation: ``transform.py <emb_path> <emb_format> [first_n]``.

    Returns:
        list: ``[emb_path, emb_format, first_n]``. ``first_n`` is always an
        ``int``; it is 0 when the optional third argument is omitted.

    Raises:
        IndexError: if fewer than two positional arguments are supplied.
        ValueError: if the optional ``first_n`` argument is not an integer.
    """
    emb_path = sys.argv[1]
    emb_format = sys.argv[2]  # 'Word2Vec' or 'Glove'

    # Entries of sys.argv are always strings. Convert here so first_n has the
    # same type (int) whether it was given explicitly or defaulted.
    first_n = int(sys.argv[3]) if len(sys.argv) > 3 else 0

    return [emb_path, emb_format, first_n]
#========1=========2=========3=========4=========5=========6=========7==
def genflow(emb_path, emb_format, first_n):
    """Apply every configured transform to an embedding and save each result.

    The embedding at ``emb_path`` is loaded through ``process_embedding``.
    For each entry returned by ``get_config(dim)`` the transform is applied
    to the full vector matrix and the result is written, in binary mode via
    ``pyemblib.write``, next to the source file as
    ``affine-<i>__source--<name>__time--<timestamp>.bin``. After all
    transforms are done, the list of written paths is saved (one per line) to
    ``affine-outputlist__source--<name>__time--<timestamp>.txt`` in the same
    directory.

    Args:
        emb_path: path to the source embedding file.
        emb_format: embedding format passed through to ``process_embedding``
            (the CLI documents 'Word2Vec' or 'Glove').
        first_n: number of leading vectors to keep; 0 (or any falsy value)
            keeps the entire embedding. Integer-like strings are accepted.

    Returns:
        None.

    Raises:
        ValueError: if ``first_n`` is truthy but not convertible to ``int``.
    """
    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)
    sys.stdout.flush()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    # Take the first n word vectors for a subset; 0 takes the entire
    # embedding. Previously this was hard-coded to 0, which silently
    # discarded the caller's argument. Coerce to int because the value may
    # arrive as a command-line string.
    first_n = int(first_n) if first_n else 0

    # Preprocess.
    print("About to preprocess. ")
    sys.stdout.flush()
    vectors_matrix, label_df = process_embedding(emb_path,
                                                 emb_format,
                                                 first_n,
                                                 None)
    print("Done preprocessing. ")
    sys.stdout.flush()

    # Dimensions of the input dataset: shape[0] is the number of rows,
    # shape[1] the dimensionality of each vector.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    dim = shape[1]

    #===================================================================

    # One timestamp is shared by every file produced in this run.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")

    # Outputs are written to the directory that contains the source file.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    print("Is anything happening here?")
    sys.stdout.flush()
    transforms = get_config(dim)
    print("Got transforms. ")
    sys.stdout.flush()

    output_embedding_paths = []
    # Wrapping the iterable itself (rather than the enumerate object) lets
    # tqdm report a total when `transforms` has a length.
    for i, transform in enumerate(tqdm(transforms)):
        # Each config entry is indexable: [0] is the callable, [1] its args.
        func = transform[0]
        arglist = transform[1]

        new_emb_path = str(os.path.join(parent, "affine-" + str(i) + "__source--" + source_name
                                        + "__" + "time--" + timestamp + ".bin"))
        sys.stdout.flush()
        output_embedding_paths.append(new_emb_path)

        print("About to start generation.")
        sys.stdout.flush()

        transformed_vectors = func(vectors_matrix, arglist)

        # shape [<num_inputs>,<dimensions>]
        print("labels shape: ", label_df.shape)
        sys.stdout.flush()

        # Build the label -> vector dict. A distinct index name avoids
        # shadowing the transform index `i` of the enclosing loop.
        # NOTE(review): assumes label_df[row] yields the label for row `row`;
        # this depends on what process_embedding returns -- confirm.
        dist_emb_dict = {}
        for row in tqdm(range(len(label_df))):
            dist_emb_dict[label_df[row]] = transformed_vectors[row]

        print("Embedding dict created. ")
        sys.stdout.flush()

        # Save the transformed embedding.
        pyemblib.write(dist_emb_dict,
                       new_emb_path,
                       mode=pyemblib.Mode.Binary)

        print("Embedding saved to: " + new_emb_path)

    # Write the output embedding names to a text file.
    outputlist_name = "affine-outputlist__source--" + source_name + "__time--" + timestamp + ".txt"
    outputlist_path = os.path.join(parent, outputlist_name)
    with open(outputlist_path, 'w') as f:
        for path in output_embedding_paths:
            f.write(path + "\n")

    return
#========1=========2=========3=========4=========5=========6=========7==
if __name__ == "__main__":
    # Script entry point: runs only when executed directly, not on import.
    # parse_args() yields [emb_path, emb_format, first_n] in that order.
    emb_path, emb_format, first_n = parse_args()
    genflow(emb_path, emb_format, first_n)