demo_app.py (forked from mgruppi/sense-demo)
from flask import Flask, render_template, request, jsonify
import os
import argparse
import numpy as np
import pickle
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
import json
from WordVectors import WordVectors
from preprocessing.generate_sentences import generate_sentence_samples
import re


app = Flask(__name__)
app.config["IMAGE_DIR"] = os.path.join("images")

data = None


class Globals:
    def __init__(self):
        self.wv1 = dict()
        self.wv2 = dict()
        self.sorted_words = None
        self.distances_ab = dict()
        self.indices_ab = dict()
        self.distances_ba = dict()
        self.indices_ba = dict()
        self.d = dict()
        self.common = 0
        self.filename1 = "A"
        self.filename2 = "B"
        self.display_name = "Unnamed"
        self.common_vocab = 0
        self.description = "(description)"
        self.period_1 = (0, 0)
        self.period_2 = (1, 1)
        self.corpus_1 = "A"
        self.corpus_2 = "B"
def fetch_datasets():
    """
    Returns a list of available datasets in `data/`.
    """
    for root, dirs, files in os.walk("data"):
        datasets = [f.replace(".pickle", "") for f in sorted(files)]
        return datasets
def fetch_metadata():
    """
    Returns a dictionary mapping each dataset name to its metadata.
    """
    for root, dirs, files in os.walk("metadata"):
        metadata = dict()
        for f in files:
            with open(os.path.join(root, f)) as fin:
                metadata[f.split(".")[0]] = json.load(fin)
        return metadata
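
# Note (descriptive, inferred from the loop above): each file under metadata/ is
# expected to be a JSON file named <dataset>.json; the returned dict maps the
# dataset name (the part before the first ".") to its parsed JSON contents.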
@app.route("/", methods=["GET", "POST"])
def index():
method = request.method
if method == "GET":
datasets = fetch_datasets()
metadata = fetch_metadata()
return render_template("demo.html", data=None,
datasets=datasets,
metadata=metadata)
else:
pass
@app.route("/loadDataset", methods=["GET", "POST"])
def load_dataset():
"""
Loads a dataset on the server-side application.
"""
data_path = request.args.get('data', type=str)
path = os.path.join("data", data_path+".pickle")
with open(path, "rb") as fin:
global data
data = pickle.load(fin)
return "ok", 200
@app.route("/getMostShiftedWords", methods=["GET"])
def get_most_shifted():
"""
Gets the most shifted words for a given alignment method from a loaded dataset.
"""
if data is None:
return "Error: dataset not loaded.", 400
method = request.args.get("method", type=str)
d_cosine = np.array([cosine(u, v) for u, v in zip(data.wv1[method].vectors, data.wv2[method].vectors)])
i_most_shifted = np.argsort(d_cosine)[::-1] # Indices sorted by highest to lowers cosine distance
n = 20
out_words = [data.wv1[method].words[i] for i in i_most_shifted[:n]]
out_scores = ["%.4f" % float(d_cosine[i]) for i in i_most_shifted[:n]]
output = {"method": method, "words": out_words, "scores": out_scores}
output = jsonify(output)
return output, 200
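
# Illustrative response shape (values are made up; "s4" is one of the alignment
# keys used elsewhere in this module):
# {"method": "s4", "words": ["plane", "..."], "scores": ["0.7213", "..."]}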
def get_neighbor_coordinates(x):
    """
    Applies PCA to an input matrix and returns a 2-d set of points.
    """
    _x = PCA(n_components=2).fit_transform(x)
    return _x.tolist()
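
# For example, a list of n d-dimensional word vectors comes back as an (n, 2)
# nested list of [x, y] pairs, suitable for a 2-d scatter plot on the client.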
@app.route("/getWordContext", methods=["GET"])
def get_word_context():
"""
Returns the nearest neighbors of a given target word in each of the input corpora.
"""
if data is None:
return "Error: dataset not loaded.", 400
target = request.args.get("target", type=str)
m = "global"
if target not in data.wv1[m]: # Word not found
output = {"error": "word not found"}
else:
target_id = data.wv1[m].word_id[target]
output = {"target": target}
neighbor_ids_ab = data.indices_ab[m][target_id]
neighbor_ids_ba = data.indices_ba[m][target_id]
n_ab = [data.wv1[m].words[i] for i in neighbor_ids_ab]
n_ba = [data.wv2[m].words[i] for i in neighbor_ids_ba]
output["neighbors_ab"] = n_ab
output["neighbors_ba"] = n_ba
# Compute coordinates
x_ab = get_neighbor_coordinates([data.wv1[m][target_id]] + [data.wv2[m][i] for i in neighbor_ids_ab])
x_ba = get_neighbor_coordinates([data.wv2[m][target_id]] + [data.wv1[m][i] for i in neighbor_ids_ba])
output["x_ab"] = x_ab
output["x_ba"] = x_ba
return jsonify(output), 200
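
# Illustrative response shape for a found word (values are made up):
# {"target": "plane", "neighbors_ab": ["jet", "..."], "neighbors_ba": ["flat", "..."],
#  "x_ab": [[0.1, -0.3], "..."], "x_ba": [[0.2, 0.5], "..."]}
# The first point in x_ab/x_ba corresponds to the target word itself.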
def highlight_sentence(sent, target, tag_s="<span class='target-highlight'>", tag_e="</span>"):
    """
    Given an input sentence `sent` and a target word `target`, return a string that wraps every occurrence of
    `target` in `sent` with a tag for highlighting.
    By default, it surrounds every occurrence of `target` with the <span class='target-highlight'> tag.
    Args:
        sent (str): Input sentence.
        target (str): Target word to be highlighted.
        tag_s (str, optional): Sets the starting tag before `target`.
        tag_e (str, optional): Sets the ending tag after `target`.
    Returns:
        sent_ (str): Output sentence.
    """
    # Case-insensitive sub, but it overwrites the original casing with `target`'s:
    # sent_ = re.sub(target, "%s%s%s" % (tag_s, target, tag_e), sent, flags=re.IGNORECASE)
    # Case-insensitive detection, case-preserving substitution.
    sent_ = re.sub(r"(?=%s)" % target, tag_s, sent, flags=re.IGNORECASE)
    sent_ = re.sub(r"(?<=%s)" % target, tag_e, sent_, flags=re.IGNORECASE)
    return sent_
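
# Illustrative usage: highlight_sentence("The Cat sat.", "cat") yields
# "The <span class='target-highlight'>Cat</span> sat."; the original casing is
# preserved because only zero-width positions around the match are substituted.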
@app.route("/getWords", methods=["GET"])
def get_words():
"""
Returns the list of wall words in the loaded data model.
"""
if data is None:
return "Error: dataset not loaded.", 400
words = sorted(data.wv1["s4"].words)
output = {"words": words}
return jsonify(output), 200
@app.route("/getSentenceExamples", methods=["GET"])
def get_sentence_examples():
"""
Returns sentence examples for a given word in different corpora A and B.
"""
if data is None:
return "Error: dataset not loaded.", 400
target = request.args.get("target", type=str)
if target not in data.wv1["s4"]:
return jsonify({"error": "word not found"}), 200
sents_a, sents_b, samples_a, samples_b = generate_sentence_samples(data, target)
# Highlight target words
sents_a = [highlight_sentence(s, target) for s in sents_a]
sents_b = [highlight_sentence(s, target) for s in sents_b]
output = {"sents_a": sents_a, "sents_b": sents_b, "samples_a": samples_a, "samples_b": samples_b}
return jsonify(output), 200
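
# The structure of samples_a/samples_b comes from generate_sentence_samples
# (imported from preprocessing/generate_sentences.py, not shown here); only the
# sents_a/sents_b lists are post-processed with highlighting above.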
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host address of the app")
    # parser.add_argument("--debug", action="store_true", help="Set debug mode to ON")
    parser.add_argument("--production", action="store_true", help="Run in production mode (debug off).")
    args = parser.parse_args()

    debug = not args.production
    app.run(host=args.host, debug=debug)
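
# Typical invocations (sketch): `python demo_app.py` starts the app with Flask
# debug mode on; `python demo_app.py --production` turns debug off;
# `python demo_app.py --host 127.0.0.1` binds to localhost only.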