# content_processor.py -- forked from theanti9/PyCrawler
from multiprocessing import Pool
import re, sys, logging
from ready_queue import ready_queue
logger = logging.getLogger("crawler_logger")


def rankKeywords(text):
    # count how often each word appears, skipping trivial stop words
    invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
    ranks = {}
    text = text.split(' ')
    for t in text:
        if t in invalid_keywords:
            continue
        if t not in ranks:
            ranks[t] = 1
        else:
            ranks[t] += 1
    return ranks
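
# Illustrative example: for the input "crawl the web crawl fast", rankKeywords
# returns {'crawl': 2, 'web': 1, 'fast': 1}; stop words such as "the" are skipped.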


def stripPunctuation(text):
    # replace anything that is not a word character or whitespace with a space
    pattern = re.compile(r'[^\w\s]')
    return pattern.sub(' ', text)


def stripScript(text):
    # remove <script>...</script> blocks; DOTALL/IGNORECASE so multi-line and
    # mixed-case script tags are caught as well
    pattern = re.compile(r'<script.*?</script>', re.DOTALL | re.IGNORECASE)
    return pattern.sub(' ', text)
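
# Illustrative examples: stripPunctuation("Hi, world!") gives "Hi  world "
# (punctuation becomes spaces), and stripScript("a<script>x=1;</script>b")
# gives "a b".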


class ContentProcessor:

    def __init__(self, url, status, text):
        self.keyword_dicts = []
        self.invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
        self.keywords = {}
        self.text = text
        self.size = 0
        self.url = url
        self.status = status

    def setText(self, text):
        self.text = text
        self.size = len(text)

    def setUrl(self, url):
        self.url = url

    def setStatus(self, status):
        self.status = status

    def setInfo(self, url, status, text):
        self.url = url
        self.status = status
        self.text = text
        self.size = len(text)

    def reset(self):
        self.keyword_dicts = []
        self.keywords = {}
        self.text = None
        self.head = None
        self.body = None
        self.title = None
        self.size = 0
        self.status = None

    def combineKeywordLists(self):
        # merge the per-chunk keyword counts into a single dict
        if len(self.keyword_dicts) == 1:
            self.keywords = self.keyword_dicts[0]
            return
        for l in self.keyword_dicts:
            for k, v in l.items():
                if k in self.keywords:
                    self.keywords[k] += v
                else:
                    self.keywords[k] = v

    # returns links to queue
    def processBody(self):
        queue = ready_queue(self.url, self.body)
        #print "found %i links to queue" % len(queue)
        self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
        if len(self.text) > 5000:
            # split long documents into chunks of roughly 500 words and rank
            # each chunk in its own worker process
            i = 0
            l = []
            while True:
                j = self.findnth(self.text[i:], ' ', 500)
                if j == -1:
                    # fewer than 500 words remain; keep the tail as the last chunk
                    if i < len(self.text):
                        l.append(self.text[i:])
                    break
                l.append(self.text[i:i + j])
                i += j + 1
            logger.debug("processing with %i processes" % len(l))
            try:
                if len(l) == 0:
                    return []
                pool = Pool(processes=len(l))
                self.keyword_dicts = pool.map(rankKeywords, l)
            except KeyboardInterrupt:
                pool.terminate()
                pool.join()
                sys.exit()
            else:
                pool.close()
                pool.join()
            logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    def processHead(self):
        pass

    def remove_html_tags(self, data):
        p = re.compile(r'<.*?>')
        return p.sub('', data)

    def findnth(self, haystack, needle, n):
        # index of the nth occurrence of needle in haystack, or -1 if there
        # are fewer than n occurrences (e.g. findnth("a b c", " ", 2) == 3)
        parts = haystack.split(needle, n)
        if len(parts) <= n:
            return -1
        return len(haystack) - len(parts[-1]) - len(needle)

    # returns the queue from processBody
    def process(self):
        text_lower = self.text.lower()
        # slice the document between the opening and closing tags; skip past the
        # '>' of the opening tag so attributes do not leak into the slice
        title_start = text_lower.find('<title')
        self.title = self.text[text_lower.find('>', title_start) + 1:text_lower.find('</title>')]
        head_start = text_lower.find('<head')
        self.head = self.text[text_lower.find('>', head_start) + 1:text_lower.find('</head>')]
        self.processHead()
        self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
        queue = self.processBody()
        self.combineKeywordLists()
        return queue

    def getDataDict(self):
        # drop keywords that appear fewer than three times before reporting;
        # iterate over a copy since the dict is modified during the loop
        for k, v in list(self.keywords.items()):
            if v < 3:
                del self.keywords[k]
        return {"address": self.url, "title": self.title, "status": self.status, "size": self.size, "keywords": self.keywords}
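

# A minimal usage sketch. The URL and HTML below are made-up placeholders, and
# process() calls ready_queue from this repository to extract links from the
# body, so that module must be importable for this to run.
if __name__ == "__main__":
    sample_html = ("<html><head><title>Example</title></head>"
                   "<body><p>example page with example keywords, example text</p></body></html>")
    processor = ContentProcessor("http://example.com/", 200, sample_html)
    links = processor.process()      # outgoing links discovered in the body
    print(processor.getDataDict())   # address, title, status, size and keyword counts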