"""Process a whole Wikipedia dump with the fruit fly
Usage:
run.py --lang=<language_code> --pipeline
run.py --lang=<language_code> --train_tokenizer
run.py --lang=<language_code> --download_wiki
run.py --lang=<language_code> --train_umap
run.py --lang=<language_code> --train_pca
run.py --lang=<language_code> --cluster_data
run.py --lang=<language_code> --train_fly
run.py --lang=<language_code> --binarize_data
run.py --lang=<language_code> --query_expansion
run.py (-h | --help)
run.py --version
Options:
--lang=<language code> The language of the Wikipedia to process.
--pipeline Run the whole pipeline (this can take several hours!)
--train_tokenizer Train a sentencepiece tokenizer for a language.
--download_wiki Download and preprocess Wikipedia for the chosen language.
--train_umap Train the UMAP dimensionality reduction model.
--train_pca Train a PCA dimensionality reduction model (alternative to UMAP).
--cluster_data Learn cluster names and apply clustering to the entire Wikipedia.
--train_fly Train the fruit fly over dimensionality-reduced representations.
--binarize_data Apply the fruit fly to the entire Wikipedia.
--query_expansion Train ridge model for query expansion.
-h --help Show this screen.
--version Show version.
"""
import configparser
from os.path import exists
from docopt import docopt
from glob import glob
from random import shuffle
import joblib
import re
import sentencepiece as spm
from codecarbon import EmissionsTracker
from spm.spm_train_on_wiki import mk_spm
from datasets.get_wiki_data import mk_wiki_data
from fly.train_models import train_umap, hack_umap_model, run_pca, hack_pca_model, train_birch, train_fly, train_query_expansion_model
from fly.apply_models import apply_dimensionality_reduction, apply_dimensionality_reduction_titles, apply_fly
from fly.prepare_clusters import generate_cluster_labels, generate_cluster_centroids
from fly.vectorizer import vectorize_scale
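# Module-level SentencePiece processor; its model is loaded lazily in the
# --query_expansion branch below.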
sp = spm.SentencePieceProcessor()


def get_training_data(lang, train_spf_path):
    def get_n_docs(input_file_path, output_file, n):
        '''Copy up to n articles from input_file_path into output_file.'''
        article_count = 0
        article = ""
        input_file = open(input_file_path)
        for l in input_file:
            if "</doc" in l:
                article += l
                output_file.write(article)
                article = ""
                article_count += 1
                if article_count == n:
                    break
            else:
                article += l
        input_file.close()
        return article_count

    print("--- Gathering training data from sample of dump files ---")
    required_article_count = 50000
    train_spf = open(train_spf_path, 'w')
    '''Get the first dump file, which usually contains 'core' articles.'''
    dump_split = True
    try:
        first_sp_file = glob(f"./datasets/data/{lang}/{lang}wiki-latest-pages-articles1.*sp")[0]
    except IndexError:  # Small wikis come as a single, unsplit dump file
        first_sp_file = f"./datasets/data/{lang}/{lang}wiki-latest-pages-articles.xml.sp"
        dump_split = False
    c = get_n_docs(first_sp_file, train_spf, 30000)  # Up to 30,000 articles from the first dump file
    print(">>> Gathered", c, "articles from", first_sp_file)
    '''Sample from the other dump files, to get the correct data distribution.'''
    required_article_count -= c
    if dump_split:
        spfs = glob(f"./datasets/data/{lang}/{lang}wiki-latest-pages-articles*[0-9].sp")
        shuffle(spfs)
        for i in range(min(4, len(spfs))):  # Guard against wikis with fewer than 4 extra dump files
            c = get_n_docs(spfs[i], train_spf, int(required_article_count / 4))  # Articles from other dump files
            print(">>> Gathered", c, "articles from", spfs[i])
    train_spf.close()
    print(">>> Finished building the training corpus ---")


def init_config(lang):
    config_path = lang + '.hyperparameters.cfg'
    if exists(config_path):
        return 1
    else:
        config = configparser.ConfigParser()
        config['GENERIC'] = {}
        config['GENERIC']['language'] = lang
        config['PREPROCESSING'] = {}
        config['PREPROCESSING']['logprob_power'] = 'None'
        config['PREPROCESSING']['top_words'] = 'None'
        config['REDUCER'] = {}
        config['REDUCER']['type'] = 'None'
        config['REDUCER']['dimensionality'] = 'None'
        config['REDUCER']['path'] = 'None'
        config['RIDGE'] = {}
        config['RIDGE']['path'] = 'None'
        config['FLY'] = {}
        config['FLY']['num_trials'] = 'None'
        config['FLY']['kc_size'] = 'None'
        config['FLY']['neighbours'] = 'None'
        config['FLY']['path'] = 'None'
        with open(config_path, 'w+') as configfile:
            config.write(configfile)


def read_config(lang):
    config_path = lang + '.hyperparameters.cfg'
    config = configparser.ConfigParser()
    config.read(config_path)
    return config_path, config


def update_config(lang, section, k, v):
    config_path, config = read_config(lang)
    config[section][k] = str(v)
    with open(config_path, 'w+') as configfile:
        config.write(configfile)
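
# Illustrative contents of <lang>.hyperparameters.cfg after a full pipeline run
# (the values and paths below are made up, not real output):
#
#   [GENERIC]
#   language = simple
#   [PREPROCESSING]
#   logprob_power = 4
#   top_words = 300
#   [REDUCER]
#   type = UMAP
#   dimensionality = 32
#   path = ./fly/models/umap/simple.umap
#   [RIDGE]
#   path = ./fly/models/ridge/simple.hacked
#   [FLY]
#   num_trials = 10
#   kc_size = 256
#   neighbours = 20
#   path = ./fly/models/fly/simple.fly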


if __name__ == '__main__':
    args = docopt(__doc__, version='Get Wikipedia in fruit fly vectors, ver 0.1')
    lang = args['--lang']
    #tracker = EmissionsTracker(output_dir="./emission_tracking", project_name="Multilingual Fly")
    #tracker.start()

    if args['--lang']:
        init_config(lang)
        train_path = f"./datasets/data/{lang}/{lang}wiki-latest-pages-articles.train.sp"

    if args['--train_tokenizer'] or args['--pipeline']:
        mk_spm(lang, 10000)

    if args['--download_wiki'] or args['--pipeline']:
        mk_wiki_data(lang, lang)  # In the normal case, the input language and the spm model language are the same
        get_training_data(lang, train_path)

    if args['--train_umap'] or args['--pipeline']:
        umap_path, input_m, umap_m, best_logprob_power, best_top_words = train_umap(lang, train_path)
        print("UMAP LOG: BEST LOG POWER -", best_logprob_power, "BEST TOP WORDS:", best_top_words)
        update_config(lang, 'PREPROCESSING', 'logprob_power', best_logprob_power)
        update_config(lang, 'PREPROCESSING', 'top_words', best_top_words)
        update_config(lang, 'REDUCER', 'path', umap_path)
        update_config(lang, 'REDUCER', 'type', 'UMAP')
        update_config(lang, 'REDUCER', 'dimensionality', str(umap_m.shape[1]))
        hacked_path, hacked_m = hack_umap_model(lang, train_path, best_logprob_power, best_top_words, input_m, umap_m)
        update_config(lang, 'RIDGE', 'path', hacked_path)

    if args['--train_pca']:
        pca_path, input_m, pca_m, best_logprob_power, best_top_words = run_pca(lang, train_path)
        print("PCA LOG: BEST LOG POWER -", best_logprob_power, "BEST TOP WORDS:", best_top_words)
        update_config(lang, 'PREPROCESSING', 'logprob_power', best_logprob_power)
        update_config(lang, 'PREPROCESSING', 'top_words', best_top_words)
        update_config(lang, 'REDUCER', 'path', pca_path)
        update_config(lang, 'REDUCER', 'type', 'PCA')
        update_config(lang, 'REDUCER', 'dimensionality', str(pca_m.shape[1]))
        hacked_path, hacked_m = hack_pca_model(lang, train_path, best_logprob_power, best_top_words, input_m, pca_m)
        update_config(lang, 'RIDGE', 'path', hacked_path)
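
    # Note: the 'hacked' model stored under [RIDGE] appears to be a ridge
    # regressor fitted to reproduce the reduced representations (hence the
    # section name); it is reloaded below for clustering and --query_expansion.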

    if args['--cluster_data'] or args['--pipeline']:
        _, config = read_config(lang)
        best_logprob_power = int(config['PREPROCESSING']['logprob_power'])
        best_top_words = int(config['PREPROCESSING']['top_words'])
        hacked_path = config['RIDGE']['path']
        hacked_m = joblib.load(hacked_path + '.m')
        brm, labels = train_birch(lang, hacked_m)
        generate_cluster_labels(lang, train_path, labels, best_logprob_power, best_top_words)
        apply_dimensionality_reduction(lang, hacked_path, best_logprob_power, best_top_words, brm)
        if lang == 'simple':
            apply_dimensionality_reduction('en', hacked_path, best_logprob_power, best_top_words)

    if args['--train_fly'] or args['--pipeline']:
        num_trials = 10   # Number of training trials
        kc_size = 256     # Size of the Kenyon cell layer
        k = 20            # Number of neighbours (stored under FLY.neighbours)
        update_config(lang, 'FLY', 'num_trials', num_trials)
        update_config(lang, 'FLY', 'kc_size', kc_size)
        update_config(lang, 'FLY', 'neighbours', k)
        fly_path, _ = train_fly(lang=lang, dataset=train_path, num_trials=num_trials, kc_size=kc_size, k=k)
        update_config(lang, 'FLY', 'path', fly_path)

    if args['--binarize_data'] or args['--pipeline']:
        _, config = read_config(lang)
        generate_cluster_centroids(train_path)
        best_logprob_power = int(config['PREPROCESSING']['logprob_power'])
        best_top_words = int(config['PREPROCESSING']['top_words'])
        apply_fly(lang, best_logprob_power, best_top_words, True)
        if lang == 'simple':
            apply_fly('simple', best_logprob_power, best_top_words, True, 'en')

    if args['--query_expansion']:
        def tokenize_text(sp, text):
            return ' '.join(sp.encode_as_pieces(text.lower()))

        _, config = read_config(lang)
        best_logprob_power = int(config['PREPROCESSING']['logprob_power'])
        best_top_words = int(config['PREPROCESSING']['top_words'])
        hacked_path = config['RIDGE']['path']
        hacked_m = joblib.load(hacked_path + '.m')
        sp.load(f'./spm/{lang}/{lang}wiki.model')
        spfs = glob(f"./datasets/data/{lang}/{lang}wiki-latest-pages-articles*sp")
        for spf in spfs:
            if "titles" in spf:
                continue
            print("Making title file for", spf)
            input_file = open(spf)
            out_file = spf.replace('.sp', '.titles.sp')
            out = open(out_file, 'w')
            for l in input_file:
                if "<doc" in l:
                    m = re.search('.*title="([^"]*)"', l)
                    title = m.group(1)
                    out.write(l)
                    out.write(tokenize_text(sp, title) + '\n')
                    out.write("</doc>\n")
            out.close()
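
        # Each .titles.sp file holds one entry per article: the original <doc>
        # header line, the sentencepiece-tokenized title, and a closing tag.
        # Illustrative entry (the tokenization shown is made up):
        #   <doc id="12" url="..." title="Anarchism">
        #   ▁anarch ism
        #   </doc>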
        apply_dimensionality_reduction_titles(lang, hacked_path, best_logprob_power, best_top_words)
        train_query_expansion_model(lang, train_path)
    #tracker.stop()