-
Notifications
You must be signed in to change notification settings - Fork 0
/
InvertedIndex.py
45 lines (33 loc) · 1.17 KB
/
InvertedIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
'''
Building an inverted index:
1. Collect the documents to be indexed.
2. Tokenize the text.
3. Apply linguistic processing (normalization) to the tokens.
4. For each token, add the document ID to the term's postings list.
'''
from Postings import Postings
class InvertedIndex:
    """In-memory inverted index mapping each term to its postings list.

    A postings list is a plain ``list`` of ``Postings`` objects, kept in
    the order in which ``addTerm`` first saw each document for that term.

    NOTE(review): duplicate suppression only compares against the most
    recent posting of a term, so callers are presumably expected to add
    all terms of one document before moving to the next -- confirm
    against the indexing driver.
    """

    def __init__(self):
        # term -> list of Postings objects, in insertion order.
        self.__dic = {}

    def addTerm(self, term: str, docId: int) -> None:
        """Record that ``term`` occurs in the document ``docId``.

        A new posting is appended unless the term's most recent posting
        already refers to ``docId``.

        Args:
            term: The term to index.
            docId: Identifier of the document containing the term.
        """
        postingList = self.__dic.get(term)
        if postingList is None:
            # First occurrence of this term: start its postings list.
            self.__dic[term] = [Postings(docId)]
        elif not postingList or docId != postingList[-1].getDocumentId():
            # The empty-list guard matters because getPostings hands out the
            # internal list, so a caller may have emptied it.
            postingList.append(Postings(docId))

    def getPostings(self, term: str) -> list:
        """Return the postings list for ``term``.

        For a known term this is the index's own list object (not a copy);
        for an unknown term a new empty list is returned.
        """
        return self.__dic.get(term, [])

    def getVocabulary(self) -> list:
        """Return all indexed terms as a sorted list."""
        return sorted(self.__dic)