-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
186 lines (134 loc) · 4.58 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from PorterStemmer import PorterStemmer
def printContext(docId, termFreq, q):
def getDoc(line, docId): #returns the relavent document
line = int(line)
nextIndex = str(int(docId) + 1)
d = ""
while True:
if line == len(docs)-1:
break
elif docs[line] == nextIndex:
break
d += docs[line]+'\n'
line+=1
#print("document: ")
#print(d)
return d
def context(doc, term):
print("CONTEXT:")
term = term.lower()
doc = doc.lower()
doc = doc.split()
count = 0
for word in doc:
if word.find(term)!=-1:
break
print(word)
count += 1
if count - 10 < 0:
start = 0
else:
start = count - 10
if count + 10 > len(doc):
end = len(doc)
else:
end = count + 10
context = ""
for i in range(start, end):
if i == count:
context+=" >>>"
elif i == count+1:
context += "<<< "
context += doc[i] + " "
print("..." + context + "...")
#print(docId)
#print(termFreq)
for index in docId: #index are the documents that need to be searched for the context
line = 0
for id in docs:
line += 1
if id == index:
#print(f"found at line {line}")
title = docs[line]
print(f"Document ID: {id}")
print(f"Title: {title}")
print(f"Occurrences in document: {termFreq[docId.index(id)]}")
document = getDoc(line, index)
context(document, q)
print("--------------------------------------------------------------")
break
def findMatches(q):
count = 0
for l in dict:
count+=1
bracket = l.find('[') - 1
if l[:bracket] == q:
#print(f"Matching term found at line {count}")
break
elif l == dict[-1]:
print(f"THERE IS NO TERM '{q}' IN THE DICTIONARY")
return
print(f"DOCUMENT FREQUENCY OF '{q}':{dict[count - 1][len(q):]}\n")
#print(posts[count - 1])
listings = posts[count - 1].split(', ')
#print(listings)
docID = []
termFreq = []
for i in listings:
docID.append(i[1:i.find(',')])
termFreq.append(i[i.find(',')+1:-1])
termFreq[-1] = termFreq[-1][:-1]
#print(docID)
#print(termFreq)
printContext(docID, termFreq, q)
def ProcessQuery(q):
punctuation = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
q = q.lower().strip()#change everything to lowercase
for c in q:
if c in punctuation:
q = q.replace(c, " ")
print(f"Post-punctuation removal: {q}")
l = q.split()
if STEMMER_ENABLED:
for w in l:
q = q.replace(w, p.stem(w, 0, len(w)-1))
print(f"Post-stemming: {q}")
return q
if __name__=="__main__":
p = PorterStemmer() #create stemmer object
dictionary = open("dictionary.txt", 'r') #open dictionary and postings file
postings = open("postings.txt", 'r')
documents = open("documents.txt", 'r')
dict = dictionary.read().splitlines() #load dictionary and postings into memory
posts = postings.read().splitlines()
docs = documents.read().splitlines()
while True:
x = input("Did you enable stemming for invert.py? (y/n): ")
x = x.lower()
if x == "y":
STEMMER_ENABLED = True
print("Input accepted. Stemming enabled for queries.")
break
elif x == "n":
STEMMER_ENABLED = False
print("Input accepted. Stemming disabled for queries.")
break
else:
print("Invalid entry.")
#add stopword enable or disable
while True:
q = input("Enter in a single term (or 'ZZEND to stop'): ")
if q.lower() == 'zzend':
print("Terminating Program.")
break
elif len(q.split())>1:
print("Multiple terms were inputted, enter only one term")
else:
print(f"Input query: {q}")
ppq = ProcessQuery(q)
print(f"Post-processed query: {ppq}\n")
findMatches(ppq)
print("\n")
documents.close()
dictionary.close()
postings.close()