# Import libraries
import sys
import time
import json
import operator
import gzip
import os
import math
import shutil
from datetime import datetime
# import our specific functions
from functions import manga_utils
from functions import manga_compator
# input/output directories and matching settings
dir_in = "data/jsons/"
dir_logs = "data/logs/"
dir_out = "output/"
min_same_labels = 4  # min shared labels for a non-R18 match
min_same_labels_h18 = 2  # min shared labels for an R18 match
min_desc_chars = 100  # descriptions shorter than this skip tf-idf scoring
max_num_matches = 25  # max matches saved per manga
ignore_tfidf_score_above_this_val = 0.30  # tf-idf scores at or above this get a +1.0 boost
ignore_tfidf_score_below_this_val = 0.05  # tf-idf scores below this fall back to the label score
weighting_label_score = 0.8  # weight of the label score when combined with tf-idf
use_labels_to_match = True  # also score candidates by their shared labels
redo_all_matches = True  # recompute matches even if a manga already has some
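# optional CLI arguments limit which mangadex ids get matched on this run, e.g.
#   python 02_calc_similarities.py 1 5000
# otherwise we fall back to the full 1..60000 id range below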
if len(sys.argv) == 3:
id_start = int(sys.argv[1])
id_end = int(sys.argv[2])
else:
id_start = 1
id_end = 60000
assert id_end >= id_start
assert id_end > 0
assert id_start > 0
# Load all the raw manga json files
time_start = time.time()
t01 = time.time()
manga_data = manga_utils.read_raw_manga_data_files(dir_in)
t11 = time.time()
print("loaded " + str(len(manga_data)) + " mangas (" + str(round(t11 - t01, 2)) + " seconds)")
# get our list of labels (genres, demographics, themes, etc)
t02 = time.time()
labels_dict = manga_utils.get_used_labels(manga_data)
print("loaded " + str(len(labels_dict)) + " labels")
labels_vec = []
for label in sorted(labels_dict.keys()):
# print(" " + str(labels_dict[label]) + " " + label + " in data")
labels_vec.append(label)
labels_weights = manga_utils.get_label_ranks(labels_vec)
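# labels_weights holds the per-label weights that are later passed to
# manga_compator.find_similar_labels when scoring candidates by shared labels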
# print label counts, smallest to largest
labels_dict_sorted = sorted(labels_dict.items(), key=operator.itemgetter(1))
for label, num in labels_dict_sorted:
print(" " + str(num) + " " + label + " in data")
# map each mangadex id to its index in manga_data
data2mangadexid = {}
for idx, manga in enumerate(manga_data):
    data2mangadexid[manga.id] = idx
# Loop through each manga and create a corpus of descriptions
# NOTE: we remove non-numeric and non-alphabetical chars
# NOTE: we remove non-English words; is this really ok?
# NOTE: should we add the genres here?? not sure..
corpus = []
id2corpus = {}  # manga_data index -> corpus row
corpus2id = {}  # corpus row -> manga_data index
counter = 0
for idx, manga in enumerate(manga_data):
# create a compounded text description
str_raw = manga.title + ". " + manga.description
if "description" in manga.external_al:
str_raw = str_raw + manga.external_al["description"]
    # also append the description of the first related manga we can find
    for related in manga.related:
        if related["id"] in data2mangadexid:
            str_raw = str_raw + manga_data[data2mangadexid[related["id"]]].description
            break
    # only include this manga in the corpus if its description is long enough
    if len(manga.description) > min_desc_chars:
        corpus.append((idx, manga_utils.clean_string(str_raw, False)))
        id2corpus[idx] = counter
        corpus2id[counter] = idx
        counter = counter + 1
# build a TF-IDF matrix over the description corpus
tfidf_matrix = manga_compator.load_corpus_into_tfidf(corpus)
t12 = time.time()
print("setup tf-idf in " + str(round(t12 - t02, 2)) + " seconds")
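# (manga_compator.find_similar_tfidf below presumably scores corpus rows by their
#  pairwise tf-idf similarity; see functions/manga_compator.py for the details)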
# loop through each and find the top matches
count_manga_matching = 0
count_manga_matched_to = 0
for ct, manga1 in enumerate(manga_data):
    # debug: uncomment to only process a specific set of manga ids
    # test_manga = [29557, 33691, 38917, 50727, 52297]
    # if manga1.id not in test_manga:
    #     continue
# skip if not in the range we will match through
if manga1.id < id_start or manga1.id > id_end:
continue
    # skip if we already have matches and are not redoing everything
    if len(manga1.matches) > 0 and not redo_all_matches:
        continue
    manga1.matches.clear()
# nice debug print of which one we are processing
print(str(round(100 * float(ct) / len(manga_data), 2)) +
"% -> manga " + str(manga1.id) + " - " + manga1.title + " - " + manga1.url)
    # get our scores based on tf-idf and labels
    # only compute a tf-idf score if the description is long enough
scores = {}
if len(manga1.description) > min_desc_chars:
        scores = manga_compator.find_similar_tfidf(tfidf_matrix, id2corpus[ct])
        # remap tf-idf corpus row ids back to manga_data ids
        scores_temp = {}
        for tmpid, score in scores.items():
            scores_temp[corpus2id[tmpid]] = score
        scores = scores_temp
    # get label-based scores if enabled
if use_labels_to_match:
scores_labels = manga_compator.find_similar_labels(manga1, labels_vec, labels_weights, manga_data)
else:
scores_labels = {}
    # loop through our data and combine the two scores
    for idx, manga in enumerate(manga_data):
        # cache whether this manga appears in either score list
        bool_in_s1 = idx in scores
        bool_in_s2 = idx in scores_labels
        # if the tf-idf score is high enough, boost it so this match ranks near the top
        if bool_in_s1 and scores[idx] >= ignore_tfidf_score_above_this_val:
            scores[idx] += 1.0
            continue
        # if the tf-idf score is too low, fall back to the label score alone
        if bool_in_s1 and bool_in_s2 and scores[idx] < ignore_tfidf_score_below_this_val:
            scores[idx] = scores_labels[idx]
            continue
        # otherwise combine the tf-idf score with the weighted label score
        if bool_in_s1 and bool_in_s2:
            scores[idx] += weighting_label_score * scores_labels[idx]
            continue
        # if there is no tf-idf score, just use the weighted label score
        if not bool_in_s1 and bool_in_s2:
            scores[idx] = weighting_label_score * scores_labels[idx]
            continue
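    # e.g. with the settings above: tf-idf 0.40 is boosted to 1.40; tf-idf 0.02
    # with label score 0.60 falls back to 0.60; tf-idf 0.10 with label score 0.60
    # combines to 0.10 + 0.8 * 0.60 = 0.58 (values here are purely illustrative)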
    # finally sort in descending order, so the highest score is first
    scores = dict(sorted(scores.items(), key=operator.itemgetter(1), reverse=True))
    # count how many labels this manga has
    manga1_labels = manga1.compute_xor_label_vector(labels_vec)
    count_manga1 = sum(1 for flag in manga1_labels if flag)
# skip if we don't have at least two labels
if count_manga1 < 2:
continue
# get the top matches, and then only select good ones
count_matched = 0
for idx in scores:
        # stop once we have enough matches
if count_matched >= max_num_matches:
break
# get the manga object for the match
manga2 = manga_data[idx]
# skip if the manga ids are the same
if manga1.id == manga2.id:
continue
        # skip if the candidate has "(promo)" in the title
        # these are promotional items for manga, not true manga entries
if "(promo)" in manga2.title.lower():
continue
# skip any manga which is already a related manga
# no need to append if it is already listed on mangadex
is_related = False
for related in manga1.related:
if manga2.id == related["id"]:
is_related = True
# print("found "+str(manga2.id)+" is already related to "+manga1.url)
break
if is_related:
continue
# skip if this manga does not have any chapters
if manga2.count_chapters < 1:
continue
        # calc the label vector for the candidate
        manga2_labels = manga2.compute_xor_label_vector(labels_vec)
        # check our exact-match requirements and skip if any fail
if not manga_compator.is_exact_match(manga1, manga2):
continue
        # count the number of labels shared between the two
        count_same = sum(1 for a, b in zip(manga1_labels, manga2_labels) if a and b)
        # skip if the candidate doesn't share the minimum number of labels
        # note: we never require more shared labels than manga1 actually has
if manga1.is_r18 and count_same < min(min_same_labels_h18, count_manga1):
continue
if not manga1.is_r18 and count_same < min(min_same_labels, count_manga1):
continue
        # if both manga have language data, require at least one language in common
        # note: we don't know what language the user reads, but if the two manga
        # don't share a single language there is no point in calling them similar
if len(manga1.languages) > 0 and len(manga2.languages) > 0:
count_same_lang = 0
for lang in manga1.languages:
if lang in manga2.languages:
count_same_lang += 1
if count_same_lang == 0:
continue
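        # (equivalently, assuming languages are plain lists of language codes:
        #  if set(manga1.languages).isdisjoint(manga2.languages): continue)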
        # append this matched manga to our current manga
        # halve the combined score to roughly normalize it into [0, 1]
manga_data[ct].matches.append({
'id': manga2.id,
'title': manga2.title,
'url': manga2.url,
            'score': math.floor(scores[idx] / 2.0 * 10000) / 10000.0,  # truncate to 4 decimal places
'r18': manga2.is_r18,
'languages': manga2.languages,
})
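        # a resulting entry looks roughly like (values purely hypothetical):
        #   {'id': 1234, 'title': '...', 'url': '...', 'score': 0.5123, 'r18': False, 'languages': [...]}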
# nice debug print
count_matched = count_matched + 1
print(" match " + str(manga2.id) + " (" + str(round(scores[idx] / 2.0, 4))
+ ") - " + manga2.title + " - " + manga2.url)
    # track how many manga received at least one match
if count_matched > 0:
count_manga_matching = count_manga_matching + 1
count_manga_matched_to = count_manga_matched_to + count_matched
# Save our updated manga json back to the input directory
manga_utils.write_raw_manga_data_files(dir_in, manga_data)
print("===========================")
print("outputted to " + dir_in)
# Finally get the compressed representation
dict_compressed = manga_utils.get_compressed_representation_string(manga_data)
manga_utils.make_dir_if_not(dir_out)
with open(dir_out + "mangas_compressed.json", 'w') as outfile:
json.dump(dict_compressed, outfile, sort_keys=False)
with gzip.open(dir_out + 'mangas_compressed.json.gz', 'wb') as f:
out_str = json.dumps(dict_compressed, sort_keys=False)
f.write(out_str.encode('utf-8'))
print("outputted to " + dir_out + "mangas_compressed.json")
print("compressed " + str(len(manga_data)) + " to only " + str(len(dict_compressed)) + " mangas")
# Write the mapping between each mangadex id and its external entries
dict_compressed_v2 = manga_utils.get_compressed_representation_string_v2(manga_data)
with open(dir_out + "md2external.json", 'w') as outfile:
json.dump(dict_compressed_v2["external"], outfile, sort_keys=False)
with gzip.open(dir_out + 'md2external.json.gz', 'wb') as f:
out_str = json.dumps(dict_compressed_v2["external"], sort_keys=False)
f.write(out_str.encode('utf-8'))
# write out the second api version format
path_output_api = dir_out + "api/"
if os.path.exists(path_output_api):
shutil.rmtree(path_output_api)
os.makedirs(path_output_api)
for key, manga1 in dict_compressed_v2["data"].items():
path_output_api_file = path_output_api + format(manga1["id"], '05') + ".json"
with open(path_output_api_file, 'w') as outfile:
json.dump(manga1, outfile, indent=2, sort_keys=False)
print("outputted to " + path_output_api)
print("saved " + str(len(dict_compressed_v2["data"])) + " api json endpoints (v2)")
# show how long it took to run
print("script took " + str(round(time.time() - time_start, 2)) + " seconds")
# append a summary line to the log file
# (the +1 below guards against dividing by zero when nothing matched)
manga_utils.make_dir_if_not(dir_logs)
with open(dir_logs + "log_calc_similarities.txt", "a") as myfile:
myfile.write(
"[" + datetime.utcnow().strftime("%B %d, %Y %H:%M:%S") + "]: " +
"Calculated similarity for " + str(count_manga_matching) + " mangas. " +
"Average of " + str(round(count_manga_matched_to / (count_manga_matching+1), 2)) + " matches per manga. " +
"Took " + str(round((time.time() - time_start) / 60.0, 2)) + " minutes to complete.\n"
)