main.py
import heapq
import os, sys
from os import path
import xml.sax
import re
from collections import defaultdict
from nltk.corpus import stopwords
import Stemmer
import time
total_documents = 99   # counter for partial index files; starts at 99 so the first file written is index100.txt
total_titles = 1   # counter for title files
titles_map = defaultdict(list)   # article number -> [title]
document_terms = []   # number of terms in each document, in article order
indexMap = defaultdict(list)   # word -> list of posting strings for the current batch
article_number = 0   # running count of articles parsed
# modules
stemmer = Stemmer.Stemmer('english')
stop_words = set(stopwords.words('english'))
# args
wiki_dump_path = sys.argv[1]   # directory containing the xml dump files
inverted_output_path = sys.argv[2]   # directory for the partial index files
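# usage (assumed invocation): python3 main.py <wiki_dump_dir>/ <inverted_index_dir>/
# the dump path is concatenated directly with file names, so a trailing slash is expected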
class WikiDumpHandler(xml.sax.ContentHandler):
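# accumulates the <title> and <text> content of the current page; when a
# </page> tag closes, the article is numbered, its title saved in titles_map,
# and the text handed to ProcessText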
def __init__(self):
self.title = ""
self.body = ""
self.cur_type = ""
# called automatically by the parser
def startElement(self,tag,attributes):
self.cur_type = tag
# called automatically by parser when content inside
# the tag is encountered
def characters(self,content):
# print(content, end='')
if self.cur_type == "title":
self.title += content
elif self.cur_type == "text":
self.body += content
# called automatically by the parser
def endElement(self,tag):
if tag == "page":
global titles_map
global article_number
article_number += 1
titles_map[article_number].append(self.refine(self.title))
process_text = ProcessText()
process_text.first_call(self.title, self.body)
self.body = ""
self.title = ""
self.cur_type = ""
def refine(self, title):
return title.strip()
##################################################
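# ProcessText splits an article into six fields (title, body, infobox,
# categories, external links, references) and runs each through tokenization,
# stop-word removal, and stemming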
class ProcessText:
def __init__(self):
self.title = []
self.body = []
self.infobox = []
self.category = []
self.external_links = []
self.references = []
def first_call(self, title, body):
text = body.split('==References==')
if len(text) == 1:
text = body.split('== References == ')
self.infobox = self.getInfobox(text[0])
self.body = self.getBody(text[0])
self.title = self.getTitle(title)
if len(text) != 1:
self.category = self.get_category(text[1])
self.external_links = self.get_external_links(text[1])
self.references = self.get_references(text[1])
# words in the document after tokenization, stop word removal and stemming
global document_terms
total_words = len(self.infobox) + len(self.body) + len(self.title) + len(self.category) + len(self.external_links) + len(self.references)
document_terms.append(total_words)
# create index
index = Index()
index.createIndex(self.title, self.body, self.infobox, self.category, self.external_links, self.references)
# process_text calls
############################
def tokenize(self, data):
data = data.encode("ascii", errors="ignore").decode()
data = re.sub(r'http[^\ ]*\ ', r' ', data)
data = re.sub(r'&nbsp;|&lt;|&gt;|&amp;|&quot;|&apos;', r' ', data)
data = re.sub(r'\—|\%|\$|\'|\||\.|\*|\[|\]|\:|\;|\,|\{|\}|\(|\)|\=|\+|\-|\_|\#|\!|\`|\"|\?|\/|\>|\<|\&|\\|\u2013|\~|\@|\(|\)|\n', r' ', data)
data = data.lower()
# data = data.split()
return data.split()
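# illustrative example: tokenize("Python (programming language)!") -> ['python', 'programming', 'language']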
def removeStopWords(self, data):
return [w for w in data if w.lower() not in stop_words and len(w) > 1]
# return data
def stem(self, data):
# data = data.lower()
return stemmer.stemWords(data)
# get body
def getBody(self, text):
data = re.sub(r'\{\{.*\}\}', r' ', text)
data = self.tokenize(data)
data = self.removeStopWords(data)
data = self.stem(data)
return data
def getTitle(self, text):
data = self.tokenize(text)
data = self.removeStopWords(data)
data = self.stem(data)
return data
# extract external links
def get_external_links(self, text):
data = text.split('\n')
links = []
for line in data:
if re.match(r'\*[\ ]*\[', line):
links.append(line)
data = self.tokenize(' '.join(links))
data = self.removeStopWords(data)
data = self.stem(data)
return data
# extract info box
def getInfobox(self, text):
data = text.split('\n')
flag = 0
info = []
for line in data:
if re.match(r'\{\{[Ii]nfobox', line):
flag = 1
info.append(re.sub(r'\{\{[Ii]nfobox(.*)', r'\1', line))
elif flag == 1:
if line == '}}':
flag = 0
continue
info.append(line)
data = self.tokenize(' '.join(info))
data = self.removeStopWords(data)
data = self.stem(data)
return data
# extract category
def get_category(self, text):
data = text.split('\n')
# print(data)
categories = []
for line in data:
if re.match(r'\[\[[Cc]ategory', line):
categories.append(re.sub(r'\[\[[Cc]ategory:(.*)\]\]', r'\1', line))
data = self.tokenize(' '.join(categories))
data = self.removeStopWords(data)
data = self.stem(data)
return data
# extract references
def get_references(self, text):
data = text.split('\n')
refs = []
for line in data:
if re.search(r'<ref', line):
refs.append(re.sub(r'.*title[\ ]*=[\ ]*([^\|]*).*', r'\1', line))
data = self.tokenize(' '.join(refs))
data = self.removeStopWords(data)
data = self.stem(data)
return data
#####################
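# Index builds one compact posting string per distinct word of the article and
# appends it to the global indexMap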
class Index:
def __init__(self):
pass
def createIndex(self, title, body, infobox, category, external_links, references):
global indexMap
global article_number
words = defaultdict(int)
d = defaultdict(int)
for word in title:
d[word] += 1
words[word] += 1
title_index = d
d = defaultdict(int)
for word in body:
d[word] += 1
words[word] += 1
body_index = d
d = defaultdict(int)
for word in infobox:
d[word] += 1
words[word] += 1
infobox_index = d
d = defaultdict(int)
for word in category:
d[word] += 1
words[word] += 1
category_index = d
d = defaultdict(int)
for word in external_links:
d[word] += 1
words[word] += 1
external_links_index = d
d = defaultdict(int)
for word in references:
d[word] += 1
words[word] += 1
references_index = d
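# posting format: 'd' + article number, then a field code and term frequency for
# every field containing the word, e.g. "d42t1b3c2" (illustrative) = article 42,
# tf 1 in title, 3 in body, 2 in categories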
for word in words.keys():
string = 'd' + str(article_number)
t = title_index[word]
if t:
string += 't' + str(t)
b = body_index[word]
if b:
string += 'b' + str(b)
i = infobox_index[word]
if i:
string += 'i' + str(i)
c = category_index[word]
if c:
string += 'c' + str(c)
l = external_links_index[word]
if l:
string += 'l' + str(l)
r = references_index[word]
if r:
string += 'r' + str(r)
indexMap[word].append(string)
# flush the in-memory index and titles to disk every 100000 (1 lakh) articles
if article_number % 100000 == 0:
printIndex = PrintToFile()
printIndex.output_to_file()
print("1L done")
TitleObject = TitlesFile()
TitleObject.output_to_file()
print("1L titles done")
#####################################
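# PrintToFile flushes the in-memory indexMap to a numbered partial index file
# (index100.txt, index101.txt, ...) under inverted_output_path and then clears it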
class PrintToFile:
def __init__(self):
pass
def output_to_file(self):
global indexMap
global inverted_output_path
global total_documents
total_documents += 1
if not path.exists(inverted_output_path):
os.mkdir(inverted_output_path)
output_file = path.join(inverted_output_path, "index" + str(total_documents) + ".txt")
with open(output_file, "w") as out:
for i, word in enumerate(sorted(indexMap.keys())):
postings = indexMap[word]
out.write(str(word) + " " + " ".join(postings) + "\n")
# clear global variables for next iteration
indexMap = defaultdict(list)
############################################
# writes one line per article: "<article number> <title>:<number of terms in the document>"
class TitlesFile:
def __init__(self):
pass
def output_to_file(self):
global titles_map
global document_terms
global total_titles
output_path = "./title/"
if not path.exists(output_path):
os.mkdir(output_path)
output_file = output_path + "title" + str(total_titles) + ".txt"
with open(output_file,"w") as out:
for i,index in enumerate(titles_map):
title = titles_map[index]
out.write(str(index) + " " + str(title[0]) + ":" + str(document_terms[i]) + "\n")
document_terms = []
titles_map = defaultdict(list)
total_titles += 1
############################################
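# Merge performs a k-way merge of the partial index files in ./index/: a heap
# always holds the smallest pending words, the postings of every file currently
# positioned at that word are concatenated, and merged output is flushed to
# ./Inverted Index/ every 500000 words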
class Merge:
def __init__(self):
global total_documents
self.cur_file = 1
self.total_documents = total_documents
self.input_path = "./index/"
self.output_path = "./Inverted Index/"
self.files = {}
self.flag = [0] * total_documents
self.top = {}
self.heap = []
self.words = {}
self.count = 0
self.data = defaultdict(list)
def mergeFiles(self):
if not path.exists(self.output_path):
os.mkdir(self.output_path)
for i in range(self.total_documents):
filename = "./index/index" + str(i+100) + ".txt"
self.files[i] = open(filename,"r")
self.flag[i] = 1
self.top[i] = self.files[i].readline().strip()
self.words[i] = self.top[i].split()
# print(self.words[i][0])
if self.words[i][0] not in self.heap:
heapq.heappush(self.heap, self.words[i][0])
while any(self.flag):
self.count += 1
temp = heapq.heappop(self.heap)
for i in range(self.total_documents):
if self.flag[i]:
if self.words[i][0] == temp:
self.data[temp].extend(self.words[i][1:])
self.top[i] = self.files[i].readline().strip()
if self.top[i] == '':
self.flag[i] = 0
self.files[i].close()
else:
self.words[i] = self.top[i].split()
if self.words[i][0] not in self.heap:
heapq.heappush(self.heap, self.words[i][0])
if self.count % 500000 == 0:
self.output_to_file()
print(str(self.count) + " done")
# print remaining
self.output_to_file()
print("remaining done")
def output_to_file(self):
filename = self.output_path + str(self.cur_file) + ".txt"
with open(filename,"w") as out:
for i,word in enumerate(sorted(self.data.keys())):
out.write(str(word)+" "+" ".join(self.data[word])+"\n")
self.data = defaultdict(list)
self.cur_file += 1
############################################
# make parser
# start = time.time()
# parser = xml.sax.make_parser()
# turn off namespaces so that startElement and endElement
# are called for every tag
# parser.setFeature(xml.sax.handler.feature_namespaces, 0)
# change the handler
# Handler = WikiDumpHandler()
# parser.setContentHandler(Handler)
# initial = time.time()
# for file in os.listdir(wiki_dump_path):
# if file[0] != '.':
# start = time.time()
# parser.parse(str(wiki_dump_path)+str(file))
# print("Done " + str(file))
# print("Time taken = " + str(time.time() - start))
# print("Total time taken = " + str(time.time() - initial))
# print remaining entries
# printIndex = PrintToFile()
# printIndex.output_to_file()
# TitleObject = TitlesFile()
# TitleObject.output_to_file()
# print("remaining done")
# with open("total.txt","w") as out:
# out.write(str(total_documents))
# merge files
initial = time.time()   # the parsing-phase timer above is commented out, so time the merge on its own
mergeObject = Merge()
mergeObject.mergeFiles()
print("Total time taken = " + str(time.time() - initial))