QueryExpansion.py
from data_helper import *
from constants import *
from properties_helper import VECTOR_OFFSET
import math
import re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag
import string
########################### DEFINE CONSTANTS ###########################
vector_post_file_handler = open(VECTOR_POSTINGS_FILE, 'rb')
document_properties = load_data(DOCUMENT_PROPERTIES_FILE)
total_num_documents = len(document_properties)
# debugging only
# def normalise_term(x):
# return x
def idf_transform(x): return math.log(total_num_documents/x, 10)
AND = "AND"
stemmed_stopwords = set([normalise_term(t) for t in stopwords.words('english')])
unstemmed_stopwords = stopwords.words('english')
punctuation = string.punctuation
######################## DRIVER FUNCTION ########################
def get_new_query_strings(line):
"""
First Level Query Refinement Public Method
    This method takes a string as input: the original query string that is fed into the program.
    The possible transformations are:
    0. Positive list (not used here)
    1. + Phrase + Bool
    2. + Phrase - Bool
    3. - Phrase - Bool
    4. + Wordnet - Bool
    5. Rocchio relevance feedback (not used here)
    6. - Phrase + Bool
    A list of new query strings is returned in the order the transformations are applied below
    (currently 3 followed by 4; the others are commented out).
    If any of the query strings are duplicated as a result of the transformations, only one copy is kept
    in the result.
Bool Query Example:
quiet AND "phone call" ->
1 'quiet AND "phone call"',
2 '"phone call" quiet',
3 'phone call quiet',
4 'smooth mute "quiet down" tranquil tranquillis still tranquil quiet subdu restrain hush placid tranquil quietli
quieten placid tranquil "pipe down" repos tranquil silenc quiesc unruffl hush lull calm seren "calm down"
"phone call" "telephon call" call'
Free Text Query Example:
quiet "phone call" ->
1 'quiet AND "phone call"',
2 'quiet "phone call"',
3 'quiet phone call',
4 'subdu mute tranquil tranquil tranquil hush smooth hush "quiet down" restrain calm still placid placid
quietli quiesc silenc lull "pipe down" tranquillis seren repos unruffl "calm down" tranquil tranquil
quiet quieten call "telephon call" "phone call"'
    Additional information:
    WordNet finds the possible synonyms/hyponyms of each term in the query string and puts all of them
    back into the query string.
:param line: Query String to be expanded
"""
if not isinstance(line, str):
raise Exception("Wrong usage of method: query string should be a str")
# This is the result to be returned
result = []
# Create tokens out of the query string
    is_bool, is_phrase, tokens = tokenize(line)  # duplicates retained; AND tokens kept
    stokens = filter_duplicates(tokens)  # duplicates removed; no longer usable for boolean (AND) queries
###### 3. NO PHRASE NO BOOL
newlinelist = []
for token in tokens:
if token != AND:
for subtoken in token.split():
# if not (subtoken in unstemmed_stopwords):
newlinelist.append(subtoken)
result.append(convert_list_to_string(newlinelist))
######
##### 4. NO BOOL Wordnet Synonyms
newlinelist = []
count = 0
for token in stokens:
if token != AND:
if token in unstemmed_stopwords:
newlinelist.append(token)
else:
thesaurized = thesaurize_term(token) + [token]
newlinelist += thesaurized
if len(thesaurized) > 1:
count += len(thesaurized) - 1
# Special trigger to switch Rocchio on if Wordnet is not useful
if count < TRIGGER_ROCCHIO_LEVEL:
import constants
constants.EXPAND_QUERY = True
result.append(convert_list_to_string(newlinelist, filter=True))
#####
##### 1. Keep PHRASE Keep BOOL
# result.append(convert_list_to_string(tokens))
#####
###### 2. PHRASE NO BOOL (not used)
# newlinelist = []
# for token in tokens:
# if token != AND:
# newlinelist.append(token)
# result.append(convert_list_to_string(newlinelist))
# ###### 6. NO PHRASE BOOL (not used)
# newlinelist = []
# for token in tokens:
# if token != AND:
# for subtoken in token.split():
# newlinelist.append(subtoken)
# newlinelist = intersperse(newlinelist, AND)
# result.append(convert_list_to_string(newlinelist))
# ######
###### 4.1 NO BOOL POS TAG Wordnet Synonyms (Not useful since the user's free text query can be quite bad)
# newlinelist = []
# tagged = pos_tag(tokens)
# for word, pos in tagged:
# if word != AND:
# if word in unstemmed_stopwords:
# newlinelist.append(word)
# else:
# thesaurized = thesaurize_term_with_pos(word, pos) + [word]
# newlinelist += thesaurized
# result.append(convert_list_to_string(newlinelist, filter=True))
######
    ##### 4.2 NO BOOL Wordnet Hyponyms (Too many terms returned and the query may explode in size)
# newlinelist = []
# for token in stokens:
# if token != AND:
# if token in unstemmed_stopwords:
# newlinelist.append(token)
# else:
# thesaurized = hyponymise_term(token) + [token]
# newlinelist += thesaurized
# result.append(convert_list_to_string(newlinelist, filter=True))
#####
# Remove duplicates
result = filter_duplicates(result)
return result
######################## RELEVANCE FEEDBACK METHODS ########################
def get_new_query_vector(vector, docIDs):
"""
Pseudo Relevance Feedback Public Method. This uses the Rocchio algorithm.
This method takes in the original query vector and list of docIDs.
    The query vector is modelled as a sparse vector: a term -> score mapping in which zero scores are not stored.
    The centroid of the document vectors for the given docIDs is calculated and added to the original query vector.
    Finally, the resulting vector is trimmed so that only the top k terms are returned.
:param vector: Original query vector
:param docIDs: List of docIDs to get the centroid
"""
# Guard methods
if not isinstance(vector, dict):
raise Exception("Wrong usage of method: vector should be a dict")
if not isinstance(docIDs, list):
raise Exception("Wrong usage of method: docIDs should be a list")
offset = get_centroid_vector(docIDs)
    for key, value in vector.items():
        vector[key] = value * ROCCHIO_ORIGINAL_QUERY_WEIGHT
for key, value in offset.items():
vector[key] = vector.get(key, 0.) + value * ROCCHIO_CENTROID_WEIGHT
return vector
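# Illustrative sketch of the Rocchio update performed above (the actual weight values live in
# constants.py and may differ from the numbers used here):
#   new_q[t] = ROCCHIO_ORIGINAL_QUERY_WEIGHT * q[t] + ROCCHIO_CENTROID_WEIGHT * centroid[t]
# e.g. with weights 1.0 and 0.5, q = {"quiet": 0.8} and centroid = {"quiet": 0.4, "call": 0.2},
# the updated vector is {"quiet": 1.0, "call": 0.1}.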
def get_centroid_vector(docIDs):
"""
Util Method which given a set of docIDs, returns the centroid of the document vectors.
The document vectors are precomputed and stored in the document properties dictionary.
A sparse dictionary of term to score mapping is returned.
:param docIDs: List of docIDs to get the centroid
"""
num_of_docs = len(docIDs)
offset = {}
for docID in docIDs:
docID = int(docID)
vector, normalisator = get_vector_from_docID_offset(
document_properties[docID][VECTOR_OFFSET])
for key, value in vector.items():
normalised = extract_value(value) / normalisator
offset[key] = offset.get(key, 0.) + normalised
# Take average
for k in offset.keys():
offset[k] /= num_of_docs
return trim_vector(offset)
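# Sketch of the centroid computed above: for docIDs d1..dn with length-normalised tf-idf vectors
# v1..vn, centroid[t] = (v1[t] + ... + vn[t]) / n, which is then trimmed by trim_vector below.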
def trim_vector(vector):
"""
Since Rocchio will return a large vector, we will only return the top k terms.
    Stopwords and punctuation are filtered from the top k terms.
:param: vector: Sparse vector
"""
new_vector = dict()
number_of_terms_insert = 0
from operator import itemgetter
sort = sorted(vector.items(), key=itemgetter(1), reverse=True)
for key, value in sort:
if (not (key in stemmed_stopwords)) and (not (key in punctuation)):
new_vector[key] = value
number_of_terms_insert += 1
if number_of_terms_insert >= ROCCHIO_TERMS:
break
return new_vector
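# Illustrative sketch (ROCCHIO_TERMS is defined in constants.py; the value 2 below is only for
# the example): with ROCCHIO_TERMS = 2 and vector = {"the": 5.0, "call": 2.9, "quiet": 1.2},
# "the" is skipped as a stopword and the result is {"call": 2.9, "quiet": 1.2}.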
def extract_value(tf_df_pair):
    """
    This method abstracts away the format of vector.txt, which stores every vector entry as a (tf, df) pair.
    Currently, this method produces a tf-idf score.
    :param: tf_df_pair: (tf, df) tuple that is saved inside vector.txt
    """
    return log_tf(tf_df_pair[0]) * idf_transform(tf_df_pair[1])
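# Worked example (assumes log_tf(tf) = 1 + log10(tf), the usual convention; the actual definition
# lives in data_helper): with 1000 documents in the collection, extract_value((3, 10))
# = (1 + log10(3)) * log10(1000 / 10) ≈ 1.477 * 2 ≈ 2.95.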
def get_vector_from_docID_offset(offset):
"""
Given the docID offset, get the sparse vector from vector.txt
:param: offset: integer offset of the sparse vector inside vector.txt
"""
    # vectors are stored as sparse indexes
    # each valid index maps to a (tf, df) pair
data = load_data_with_handler(vector_post_file_handler, offset)
# we need to normalise
normalisator = 0.
for key, value in data.items():
normalisator += extract_value(value) ** 2
normalisator = math.sqrt(normalisator)
return data, normalisator
######################## WORDNET EXPANSION METHODS ########################
def thesaurize_term(word):
"""
    Given a term t, returns a list of unique synonyms.
    If a term that has two words is given, the space will be replaced by a _ (this is the WordNet format).
    The resulting list will also have _ replaced back to spaces.
    :param: word: Word to look up in WordNet
"""
word = word.replace(" ", "_")
terms = []
for synset in wordnet.synsets(word):
for item in synset.lemma_names():
terms.append(item)
return list(set(convert_wordnet_terms(terms)))
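# Illustrative example (the exact output depends on the installed WordNet data and is unordered):
#   thesaurize_term("phone call") -> ["telephone call", "call", "phone call", ...]
# Multi-word lemmas such as "telephone_call" come back with the underscore replaced by a space.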
def thesaurize_term_with_pos(word, pos):
"""
    Similar to thesaurize_term, but this method also takes in the POS tag of the word,
    which helps WordNet further reduce the number of terms returned.
    :param: word: Word to look up in WordNet
    :param: pos: POS tag of the word
"""
word = word.replace(' ', '_')
terms = []
for synset in wordnet.synsets(word, pos=get_wordnet_pos(pos)):
for lemma in synset.lemmas():
non_lemmatized = lemma.name().split('.', 1)[0].replace('_', ' ')
terms.append(non_lemmatized)
    return list(set(terms))
def get_wordnet_pos(treebank_tag):
"""
    Util function to convert Penn Treebank POS tags (as produced by pos_tag) to WordNet POS tags.
"""
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
return wordnet.VERB
elif treebank_tag.startswith('N'):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
return ''
def hyponymise_term(word):
"""
    Given a term t, returns a list of unique hyponyms.
    If a term that has two words is given, the space will be replaced by a _ (this is the WordNet format).
    The resulting list will also have _ replaced back to spaces.
    :param: word: Word to look up in WordNet
"""
word = word.replace(" ", "_")
terms = []
for synset in wordnet.synsets(word):
for item in synset.closure(lambda s: s.hyponyms()):
terms += item.lemma_names()
return list(set(convert_wordnet_terms(terms)))
def convert_wordnet_terms(terms):
"""
    Convert WordNet format back to normal terms, such as replacing _ with spaces.
    Drops terms that are longer than two words.
    :param: terms: List of terms in WordNet format
    """
    newterms = []
    for term in terms:
        term = term.replace("_", " ")
        if len(term.split()) > 2:
            continue  # skip terms longer than two words, as documented above
        newterms.append(term)
return newterms
######################## UTIL FUNCTIONS ########################
def strip_query_to_free_text(line):
"""
A really simple util function that strips out all boolean operators and phrase markings.
:param: line: Query string
"""
result = []
b1, b2, tokens = tokenize(line)
for token in tokens:
if token != AND:
for subtoken in token.split():
result.append(subtoken)
return convert_list_to_string(result)
def tokenize(line):
"""
    Tokenises a line into a list of tokens, using spaces or double quotes (") as delimiters.
For example, the string 'quiet "phone call"' is converted to a list ['quiet', 'phone call'].
Also returns is_bool and is_phrase to indicate if the line has boolean query or phrases respectively.
:param: line: Query string
"""
is_bool = False
is_phrase = False
line = " ".join(line.split())
result = []
temp = ""
capture = False
for c in line:
if c == '"':
is_phrase = True
if capture == False:
capture = True
else:
result.append(temp.strip())
capture = False
temp = ""
        elif c == " ":
if capture:
temp += c
continue
else:
result.append(temp.strip())
temp = ""
else:
temp += c
result.append(temp.strip())
result = list(filter(None, result))
if "AND" in result:
is_bool = True
return is_bool, is_phrase, result
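# Example:
#   tokenize('quiet AND "phone call"') -> (True, True, ['quiet', 'AND', 'phone call'])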
def intersperse(lst, item):
"""
Util method which adds an item in-between every element in the list.
:param lst: list to be modified
:param item: item to be inserted
"""
result = [item] * (len(lst) * 2 - 1)
result[0::2] = lst
return result
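# Example: intersperse(['a', 'b', 'c'], AND) -> ['a', 'AND', 'b', 'AND', 'c']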
def filter_duplicates(line_list):
"""
This method takes in a list of terms and removes the duplicated terms.
:param line_list: List of tokens
"""
tempresult = []
tempresult_set = set()
    for i in line_list:
        if i not in tempresult_set:
            tempresult.append(i)
            tempresult_set.add(i)
return tempresult
def normalise_all_tokens_in_list(line_list):
"""
This method takes in a list of terms and normalises each of them using case folding and stemming.
    A list of normalised terms is returned.
:param: line_list: list of tokens
"""
for i in range(len(line_list)):
if line_list[i] == AND:
continue
line_list[i] = " ".join([normalise_term(x)
for x in line_list[i].split()])
return line_list
def convert_list_to_string(line_list, filter=False):
"""
    Util function which converts a list of tokens into a string.
    If filter is True, duplicates are removed.
    Filter will also remove AND tokens, so it cannot be used with boolean queries.
:param: line_list: list of tokens
:param: filter: enable removal of duplicates
"""
result = ""
# normalise all tokens first
line_list = normalise_all_tokens_in_list(list(line_list))
if filter:
line_list = filter_duplicates(line_list)
for line in line_list:
if line == AND:
result += line + " "
continue
subline = line.split()
if len(subline) > 1:
result += ' "'
for s in subline:
result += s + " "
result = result[:-1]
result += '" '
else:
result += line + ' '
    return result.strip().replace("  ", " ")  # collapse the double spaces introduced before opening phrase quotes
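# Illustrative example (the exact output depends on the stemmer used by normalise_term in data_helper):
#   convert_list_to_string(["quiet", "phone call"]) -> 'quiet "phone call"'
if __name__ == "__main__":
    # Minimal manual check (a sketch only; it assumes the postings and document-properties files
    # loaded at the top of this module have already been built).
    for expanded in get_new_query_strings('quiet AND "phone call"'):
        print(expanded)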