complete_relevance_matrix.py
import argparse
import csv
import json
import os
import time
from multiprocessing import Pool, freeze_support

import numpy as np
from gensim.models import KeyedVectors

global_npy_dict = None
global_word2vec = None
try:
    # Due to multiprocessing, the model needs to be initialized here so that
    # spawned worker processes load it when they re-import this module.
    global_word2vec = KeyedVectors.load("./data/word2vec_model")
except Exception:
    print("Word embeddings not defined yet, generating word embeddings...")


def prepare_from_NPY(filepath_in: str):
    '''
    Retrieves data from a RELISH npy file, combining both title and abstract
    into a single document.

    Parameters
    ----------
    filepath_in: str
        The filepath of the RELISH input npy file.

    Returns
    -------
    dict of list of str
        A dictionary mapping each PMID to its tokenized document.
    '''
    doc = np.load(filepath_in, allow_pickle=True)
    documents = {}
    for line in doc:
        # Each row holds (pmid, title tokens, abstract tokens).
        if isinstance(line[1], (np.ndarray, np.generic)):
            document = np.ndarray.tolist(line[1])
            document.extend(np.ndarray.tolist(line[2]))
        else:
            document = line[1]
            document.extend(line[2])
        documents[int(line[0])] = [w for w in document]
    return documents


def generate_Word2Vec_model(params: dict, iteration: int, word_embedding_directory: str):
    '''
    Generates a word2vec model from all RELISH sentences using gensim and
    saves the model files.

    Parameters
    ----------
    params: dict
        A dictionary of the hyperparameters for the model.
    iteration: int
        The index of the hyperparameter set being processed.
    word_embedding_directory: str
        The directory in which to store the word embeddings.
    '''
    from gensim.models import Word2Vec
    sentence_list = []
    for pmid in global_npy_dict:
        sentence_list.append(global_npy_dict[pmid])
    params['sentences'] = sentence_list
    model = Word2Vec(**params)
    # The static path is required so that multiprocessing workers can load
    # the current model at import time.
    model.save("./data/word2vec_model")
    os.makedirs(f"{word_embedding_directory}/{iteration}", exist_ok=True)
    model.save(f"{word_embedding_directory}/{iteration}/word2vec_model")


def get_WMD_distance(tokens: list):
    '''
    Computes the Word Mover's Distance between two documents.
    Used in complete_relevance_matrix with multiprocessing.

    Parameters
    ----------
    tokens: list of list of str
        A tokenized document pair.

    Returns
    -------
    float
        The Word Mover's Distance (WMD) between the two documents.
    '''
    # The saved file contains a full Word2Vec model, so the word vectors are
    # accessed through its .wv attribute.
    return global_word2vec.wv.wmdistance(tokens[0], tokens[1])
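
# Note: gensim's wmdistance relies on an optimal-transport backend; depending
# on the installed gensim version this is the pyemd or the POT package, so one
# of those has to be available in the environment.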


def complete_relevance_matrix(evaluation_file: str, iteration: int):
    '''
    Adds Word Mover's Distance to the evaluation matrix from the .npy
    formatted embeddings. Document pairs are read from the blank relevance
    matrix at ./data/relevance_WMD_blank.tsv.

    Parameters
    ----------
    evaluation_file: str
        The evaluation matrix csv file.
    iteration: int
        The index of the hyperparameter set being processed.
    '''
    global global_word2vec
    start = time.time()
    print("Preparing rows...")
    header = []
    rows = []
    tokenset_pairs = []
    with open("./data/relevance_WMD_blank.tsv", newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\t')
        header = next(spamreader)  # Save and remove the header.
        for row in spamreader:
            try:
                first_doc = global_npy_dict[int(row[0])]
                second_doc = global_npy_dict[int(row[1])]
                rows.append(row)
                tokenset_pairs.append((first_doc, second_doc))
            except KeyError:
                # Skip pairs whose PMIDs are missing from the dictionary.
                continue
    print(f"Processing {len(tokenset_pairs)} rows...")
    with open(f"{evaluation_file}_{iteration}.tsv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(header)
        total_processed = 0
        with Pool() as p:
            # Chunks of 100 pairs are handed to each worker at a time.
            iterator = p.imap(get_WMD_distance, tokenset_pairs, 100)
            for distance in iterator:
                row = rows[total_processed]
                # Convert the distance into a similarity score in (0, 1].
                row[3] = round(1 / (1 + distance), 4)
                writer.writerow(row)
                total_processed += 1
                if total_processed % 100 == 0 or total_processed == len(tokenset_pairs):
                    print(
                        f"Processed {total_processed}/{len(tokenset_pairs)} rows...")
            p.close()
            p.join()
    # Release the model so the next iteration's model can be loaded.
    global_word2vec = None
    print(f"Finished in {time.time() - start:.2f} seconds.")


if __name__ == "__main__":
    # freeze_support() must be called right after the __main__ guard for
    # multiprocessing to work in frozen executables on Windows.
    freeze_support()
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="Path to input RELISH tokenized .npy file")
    parser.add_argument("-ma", "--matrix", type=str,
                        help="Path of relevance matrix file")
    parser.add_argument("-pj", "--params_json", type=str,
                        help="File location of word2vec parameter list.")
    parser.add_argument("-md", "--model_directory", type=str,
                        help="Directory to store word embeddings in.")
    args = parser.parse_args()
    params = []
    with open(args.params_json, "r") as openfile:
        params = json.load(openfile)
    print("Preparing NPY dict...")
    global_npy_dict = prepare_from_NPY(args.input)
    for iteration in range(len(params)):
        generate_Word2Vec_model(
            params[iteration], iteration, args.model_directory)
        global_word2vec = KeyedVectors.load("./data/word2vec_model")
        complete_relevance_matrix(args.matrix, iteration)
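
# Example invocation (illustrative; the paths and values below are
# placeholders, not files shipped with this repository):
#
#   python complete_relevance_matrix.py \
#       -i ./data/RELISH_tokenized.npy \
#       -ma ./evaluation/relevance_WMD \
#       -pj ./data/hyperparameters.json \
#       -md ./data/embeddings
#
# The params JSON is expected to hold a list of keyword-argument sets for
# gensim's Word2Vec, one per iteration, e.g.:
#
#   [
#       {"vector_size": 200, "window": 5, "min_count": 2, "epochs": 5},
#       {"vector_size": 300, "window": 7, "min_count": 2, "epochs": 10}
#   ]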