# reverse_index_builder.py
from collections import defaultdict
from math import log10
import os.path

import dill

from parse_docs import Parse_cacm
from reverse_index import Reverse_index


class Reverse_index_builder:
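    """Build (or load from cache) a reverse index over the CACM collection.

    The term weights stored in the index depend on the chosen ponderation
    method: raw tf-idf, max-normalized tf-idf, or max-normalized frequency.
    """
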
    PONDERATION_TF_IDF = 'tf_idf'
    PONDERATION_NORMAL_TF_IDF = 'normal_tf_idf'
    PONDERATION_NORMAL_FREQUENCY = 'normal_frequency'
    PONDERATION_LIST = [PONDERATION_NORMAL_TF_IDF, PONDERATION_TF_IDF, PONDERATION_NORMAL_FREQUENCY]

    def __init__(self, ponderation_method=PONDERATION_NORMAL_TF_IDF, index_type='dict', save_folder_path='data/'):
        self.save_folder_path = save_folder_path
        self.ponderation_name = ponderation_method
        self.index_type = index_type
        if ponderation_method == self.PONDERATION_TF_IDF:
            self.ponderation_method = self.create_with_ponderation_tf_idf
        elif ponderation_method == self.PONDERATION_NORMAL_TF_IDF:
            self.ponderation_method = self.create_with_ponderation_normal_tf_idf
        elif ponderation_method == self.PONDERATION_NORMAL_FREQUENCY:
            self.ponderation_method = self.create_with_ponderation_normal_frequency
        else:
            raise ValueError('Unknown ponderation method: %s' % ponderation_method)

    def create_reverse_index(self, documents_filename, common_words_filename):
        """Load the reverse index from disk if it already exists; otherwise build and cache it."""
        reverse_index_file = os.path.join(self.save_folder_path, self.ponderation_name + '.rev')
        if os.path.isfile(reverse_index_file):
            print('Loading reverse index...', end=' ')
            with open(reverse_index_file, 'rb') as in_strm:
                reverse_index = dill.load(in_strm)
            print('done')
        else:
            print('Loading raw documents...', end=' ')
            # Parse the raw collection and the stop-word list.
            parser = Parse_cacm(documents_filename, common_words_filename)
            index = parser.parse_file()
            print('done')
            print('Creating reverse index...', end=' ')
            reverse_index = self.ponderation_method(index)
            reverse_index.other_infos['ponderation_method'] = self.ponderation_name
            reverse_index.other_infos['number_of_documents'] = len(index)
            with open(reverse_index_file, 'wb') as output:
                dill.dump(reverse_index, output, dill.HIGHEST_PROTOCOL)
            print('done')
        return reverse_index

    def create_with_ponderation_tf_idf(self, index, compute_norm=True):
        """Weight each (term, document) pair with w = (1 + log10(tf)) * log10(N / df).

        Note: reverse_index.idf actually stores document frequencies (df),
        not inverse document frequencies.
        """
        N = len(index)
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(lambda: defaultdict(float))
        id_full_list = []
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                tf_idf_ponderation = (1 + self.custom_log(tf_counter[term])) * log10(float(N) / reverse_index.idf[term])
                reverse_index.add_entry(term, document_id, tf_idf_ponderation)
                id_full_list.append(document_id)
                if compute_norm:
                    reverse_index.other_infos['norms'][document_id]['linear'] += tf_idf_ponderation
                    reverse_index.other_infos['norms'][document_id]['quadratic'] += tf_idf_ponderation * tf_idf_ponderation
        reverse_index.set_id_set(set(id_full_list))
        return reverse_index
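
    # Worked example for the tf-idf weights above (illustrative numbers):
    # with N = 100 documents, a term with tf = 10 in a document and df = 10
    # documents containing it gets w = (1 + log10(10)) * log10(100 / 10) = 2.0.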

    def create_with_ponderation_normal_tf_idf(self, index):
        """Like tf-idf, but each term's weights are divided by the term's maximum weight."""
        reverse_index = self.create_with_ponderation_tf_idf(index, compute_norm=False)
        # Keep the maximum unnormalized ponderation per term: vector-space search
        # needs it to normalize the query weights the same way.
        reverse_index.other_infos['max_unnormalized_ponderation'] = defaultdict(float)
        max_ponderation = {}
        N = len(index)
        for word in reverse_index.get_index():
            max_ponderation[word] = max(reverse_index.get_entry(word).values())
            reverse_index.other_infos['max_unnormalized_ponderation'][word] = max_ponderation[word]
            # Normalize in place to avoid duplicating the (potentially huge) entries.
            for document_id in reverse_index.get_entry(word):
                reverse_index.get_entry(word)[document_id] = reverse_index.get_entry(word)[document_id] / float(max_ponderation[word])
        # Compute the document norms from the normalized weights.
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                sum_element = (1 + self.custom_log(tf_counter[term])) * log10(float(N) / reverse_index.idf[term]) / float(max_ponderation[term])
                reverse_index.other_infos['norms'][document_id]['linear'] += sum_element
                reverse_index.other_infos['norms'][document_id]['quadratic'] += sum_element * sum_element
        return reverse_index

    def create_with_ponderation_normal_frequency(self, index):
        """Weight each (term, document) pair with w = tf / max(tf over the document)."""
        reverse_index = Reverse_index(self.index_type)
        reverse_index.idf = self.create_idf_counter(index)
        reverse_index.other_infos['norms'] = defaultdict(lambda: defaultdict(float))
        id_full_list = []
        max_frequency_in_document = defaultdict(int)
        # First, build the unnormalized reverse index and record each document's
        # maximum term frequency.
        for (document_id, tf_counter) in index:
            for term in tf_counter:
                tf_ponderation = tf_counter[term]
                reverse_index.add_entry(term, document_id, tf_ponderation)
                max_frequency_in_document[document_id] = max(max_frequency_in_document[document_id], tf_ponderation)
                id_full_list.append(document_id)
        # Then normalize each weight by the maximum term frequency of its document,
        # accumulating the document norms from the normalized weights.
        for word in reverse_index.get_all_words():
            for document_id in reverse_index.get_entry(word):
                normalized_ponderation = reverse_index.get_entry(word)[document_id] / float(max_frequency_in_document[document_id])
                reverse_index.get_entry(word)[document_id] = normalized_ponderation
                reverse_index.other_infos['norms'][document_id]['linear'] += normalized_ponderation
                reverse_index.other_infos['norms'][document_id]['quadratic'] += normalized_ponderation * normalized_ponderation
        reverse_index.set_id_set(set(id_full_list))
        return reverse_index
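
    # Worked example for the normalized frequency above (illustrative numbers):
    # a term with tf = 3 in a document whose most frequent term has tf = 6
    # gets w = 3 / 6 = 0.5.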

    def custom_log(self, number):
        """log10 that returns 0 for non-positive input, so a zero term frequency contributes nothing."""
        if number > 0:
            return log10(float(number))
        return 0

    def create_idf_counter(self, index):
        """Count, for each word, the number of documents containing it (its document frequency)."""
        idf_counter = defaultdict(int)
        for (document_id, tf_counter) in index:
            for word in tf_counter:
                idf_counter[word] += 1
        # Horribly slow alternative, kept for reference:
        # idf_counter = Counter()
        # for (document_id, tf_counter) in index:
        #     # Increment the counter by 1 for each distinct term in tf_counter;
        #     # set(tf_counter) drops the occurrence counts, then Counter addition sums.
        #     idf_counter += Counter(set(tf_counter))
        return idf_counter
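

# A minimal usage sketch, assuming the repository layout referenced above
# ('sources/cacm.all', 'sources/common_words', and a 'data/' cache folder);
# the term 'algorithm' is only an illustrative lookup key.
if __name__ == '__main__':
    builder = Reverse_index_builder(
        ponderation_method=Reverse_index_builder.PONDERATION_NORMAL_TF_IDF,
        save_folder_path='data/',
    )
    reverse_index = builder.create_reverse_index('sources/cacm.all', 'sources/common_words')
    # Postings for one term: a mapping from document id to weight.
    print(reverse_index.get_entry('algorithm'))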