-
Notifications
You must be signed in to change notification settings - Fork 0
/
Index.py
79 lines (65 loc) · 2.35 KB
/
Index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
'''
Index: builds the PositionalInvertedIndex and the BiwordIndex for a corpus.
TODO: add more index types
'''
from PositionalInvertedIndex import PositionalInvertedIndex
from BiwordIndex import BiwordIndex
from Corpus import Corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
class Index:
    """Build the search indexes (positional inverted and biword) for a corpus.

    The corpus is built once, at construction time, from ``directory``; both
    indexers read that same corpus.
    """

    def __init__(self, directory):
        """Store ``directory`` and build the corpus from it.

        Args:
            directory: forwarded unchanged to ``Corpus.buildCorpus``.
        """
        # Not read anywhere in this class; kept so the instance layout is unchanged.
        self.__index = []
        self.__directory = directory
        self.__corpus = self.buildCorpus()

    @staticmethod
    def _normalizedTokens(content, stemmer):
        """Tokenize ``content`` and return its lower-cased, stemmed tokens in order.

        Shared by both indexers so every term is normalized the same way.
        """
        return [stemmer.stem(token.lower()) for token in word_tokenize(content)]

    def PositionalInvertedIndexer(self):
        """Index every normalized token of every document with its position.

        Positions are 1-based and restart at 1 for each document.

        Returns:
            The populated ``PositionalInvertedIndex``.
        """
        stemmer = PorterStemmer()
        positionalInvertedIndex = PositionalInvertedIndex()
        # The corpus is iterated by document id and indexed by that id
        # (presumably a dict-like mapping; see Corpus.buildCorpus).
        for docId in self.__corpus:
            content = self.__corpus[docId].getContent()
            terms = self._normalizedTokens(content, stemmer)
            for position, term in enumerate(terms, start=1):
                positionalInvertedIndex.addTerm(term, docId, position)
        return positionalInvertedIndex

    def BiwordIndexer(self):
        """Index every pair of adjacent normalized tokens of every document.

        Example: the tokens [i, love, coffee] produce "i love" and "love coffee".
        Both halves of each pair are lower-cased and stemmed, including the
        first token of the document. Documents with fewer than two tokens
        contribute no biwords.

        Returns:
            The populated ``BiwordIndex``.
        """
        stemmer = PorterStemmer()
        biwordIndex = BiwordIndex()
        for docId in self.__corpus:
            content = self.__corpus[docId].getContent()
            terms = self._normalizedTokens(content, stemmer)
            # zip stops at the shorter sequence, so empty and single-token
            # documents simply yield no pairs instead of raising IndexError.
            for first, second in zip(terms, terms[1:]):
                biwordIndex.addTerm(first + " " + second, docId)
        return biwordIndex

    # build the corpus for the search engine app
    def buildCorpus(self):
        """Return the result of ``Corpus().buildCorpus`` for the stored directory."""
        return Corpus().buildCorpus(self.__directory)

    def buildIndex(self):
        """Build both indexes.

        Returns:
            A ``(positional inverted index, biword index)`` tuple.
        """
        return self.PositionalInvertedIndexer(), self.BiwordIndexer()

    def getCorpus(self):
        """Return the corpus built at construction time."""
        return self.__corpus