Crawler.py
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import random
import urlparse
import time
import csv
import requests
class Crawler(object):
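    '''Simple crawler that starts at http://uoregon.edu/, follows a random
    unvisited on-site link each step, and records every page it finds'''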
    '''Constructor'''
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://uoregon.edu/" # Current page's address
        self.links = set()  # Set of every link fetched so far
        self.visited_links = set()  # Set of links already crawled
        self.numberVisited = {}  # How many times each link has been seen
        self.counter = 0  # Simple counter for debugging purposes
    '''Open the current url and acquire its list of links'''
def open(self):
# Open url
print self.counter, ":", self.current_page
        # Try to open the url; requests that fail (for example a 404 or 500
        # error, or a url that points at a file) fall through to the
        # exception handler below, which reports a page error
try:
# html = requests.get(self.current_page).text
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
self.numberVisited[self.current_page] = 1
# self.soup = BeautifulSoup(html)
self.soup = BeautifulSoup(html_code)
        except:
            print "page Error"
# Fetch all Links
        page_links = []
        # Substrings that mark a link as one to skip (calendars, external
        # sites, downloadable files, query strings, media, php pages, ...)
        skip_words = (u'calendar', u'.com', u'Shibboleth', u'.pdf', u'.gzip',
                      u'.zip', u'.aspx', u'search', u'jobs', u'?', u'&',
                      u'=', u'#', u'.jpg', u'.ppsx', u'.pptx', u'mp4',
                      u'png', u'php', u'jpeg')
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                # Skip empty links and links containing any of the
                # blacklisted substrings above
                if link is not None and \
                        not any(word in link for word in skip_words):
print "Found link: '" + link + "'"
if link.startswith('http') and u'uoregon.edu' in link:
page_links.append(link)
print "Adding link" + link + "\n"
elif link.startswith('//'):
parts = urlparse.urlparse(self.current_page)
page_links.append(parts.scheme + ":" + link)
print "Adding link " + parts.scheme + ":" + link
elif link.startswith('/'):
parts = urlparse.urlparse(self.current_page)
page_links.append(parts.scheme + '://' +
parts.netloc + link)
print "Adding link " + parts.scheme + '://' + \
parts.netloc + link + "\n"
else:
pass
except Exception, ex:
print ex
        # If the link is already in the dictionary, increment its count;
        # otherwise add it with a count of 1
for link in page_links:
if link in self.numberVisited:
self.numberVisited[link] += 1
else:
self.numberVisited[link] = 1
# Update the links data
self.links = self.links.union(set(page_links))
        # Choose a random url from the non-visited set (if any remain)
        unvisited = self.links.difference(self.visited_links)
        if unvisited:
            self.current_page = random.sample(unvisited, 1)[0]
        self.counter += 1
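    '''Crawl the site, then write each page's html and a csv summary to disk'''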
def run(self):
start = time.time()
        # Crawl up to 700 webpages (or stop once every fetched url has been
        # visited)
        while len(self.visited_links) < 700 and \
                (not self.links or
                 not self.links.issubset(self.visited_links)):
self.open()
for link in self.links:
print link
        # Open in binary mode so the csv module does not add blank rows on
        # Windows under Python 2
        csvFile = open("C:/Users/Matthew/Documents/CS321/cs321proj3" +
                       "/csvFiles/urlCSVdata.csv", mode="wb")
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['Id Number', 'Number of times',
'File Name', 'URL', 'title', 'text'])
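        # Re-fetch every collected link, save its html to disk, and add a
        # summary row to the csv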
count = 0
for link in self.links:
try:
                # Strip characters that cannot appear in a file name
fileName = link.replace(":", "").replace("/", "").\
replace(".", "").replace("?", "").replace("=", "").\
replace("&", "").replace("%", "")
with open("C:/Users/Matthew/Documents/CS321/cs321proj3" +
"/htmlFiles/"+fileName+".txt", mode="w") as outfile:
result = urllib2.urlopen(link)
# result = requests.get(link).text
source_code = result.read()
outfile.write(source_code)
# outfile.write(result)
soup_code = BeautifulSoup(source_code)
# soup_code = BeautifulSoup(result)
title = soup_code.title.string.encode('ascii', 'ignore')
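                    # Drop script and style blocks so only the visible text
                    # remains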
for script in soup_code(["script", "style"]):
script.extract()
text = soup_code.get_text().replace("\n", " ")
csvWriter.writerow([count,
self.numberVisited[link],
fileName,
link,
title,
text.encode('ascii', 'ignore')])
count += 1
            except:
                # Skip links that cannot be fetched, parsed, or written
                pass
csvFile.close()
print len(self.links)
print time.time()-start, 'seconds'
if __name__ == '__main__':
C = Crawler()
C.run()