-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf.py
99 lines (85 loc) · 2.73 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# ddd.qqq = lnc.ltn
import math
from PostingList import PostingList
import os
from collections import OrderedDict
class tfidf:
def __init__(self):
pl = PostingList()
self.posting_list = pl.get_posting_list()
self.N = len(os.listdir("Dataset"))
self.tfidf_wts = {}
def log_tf(self, tf):
if tf == 1:
return 1
return 1 + math.log10(tf)
def idf(self, df, isQuery):
if not isQuery:
return 1
if df == 0:
return 0
return math.log10(self.N/df)
# def __cosine_normalization(self, dic):
# sum = 0
# for key in dic:
# sum += dic[key] ** 2
# sum = math.sqrt(sum)
# for key in dic:
# dic[key] /= sum
# return dic
def normalize(self, dic, isQuery):
if isQuery:
return dic
sum = 0
for key in dic:
sum += dic[key] ** 2
sum = math.sqrt(sum)
for key in dic:
dic[key] /= sum
return dic
def query_tfidf(self, query):
dic = {}
for word in query:
if word in dic:
dic[word] += 1
else:
dic[word] = 1
# idf = self.idf(0, isQuery=True)
for key in dic:
idf = self.idf(self.posting_list[key][0], isQuery=True)
dic[key] = self.log_tf(dic[key]) * idf
dic = self.normalize(dic, isQuery=True)
return dic
def tf_idf(self, is_query):
for key in self.posting_list:
idf = self.idf(self.posting_list[key][0], is_query)
dic = self.posting_list[key][1]
for key2 in dic:
dic[key2] = self.log_tf(dic[key2]) * idf
# print(dic)
self.posting_list[key][1] = self.normalize(dic, is_query)
self.tfidf_wts = self.posting_list
self.store_tfidf_weights("tfidf_weights/weights.txt")
def get_tfidf_weights(self):
exist = os.path.exists("tfidf_weights/weights.txt")
# print(exist)
if exist:
if os.stat("tfidf_weights/weights.txt").st_size != 0: # check if file is not empty
file = open('tfidf_weights/weights.txt',mode='r')
text = str(file.read())
file.close()
dictionary = eval(text)
self.tfidf_wts = dictionary
else:
self.tf_idf(is_query=False)
# saves posting lists to a file
def store_tfidf_weights(self, filename):
f = open(filename, 'w')
f.write(str(self.tfidf_wts))
f.close()
def get_N(self):
return self.N
if __name__ == '__main__':
tfidf = tfidf()
tfidf.get_tfidf_weights()
print(tfidf.tfidf_wts)