# -*- coding: utf-8 -*-
"""
domain_link_scraper()
@author: methylDragon

                               .     .
                            .  |\-^-/|  .
                           /| } O.=.O { |\
                          /´ \ \_ ~ _/ / `\
                        /´ |  \-/ ~ \-/  | `\
                        |   |  /\\ //\  |   |
                         \|\|\/-""-""-\/|/|/
                                 ______/ /
                                 '------'
              _   _       _ ___
  _ __  ___| |_| |_ _  _| ||   \ _ _ __ _ __ _  ___ _ _
 | '  \/ -_)  _| ' \ || | || |) | '_/ _` / _` / _ \ ' \
 |_|_|_\___|\__|_||_\_, |_||___/|_| \__,_\__, \___/_||_|
                    |__/                  |___/
-------------------------------------------------------
github.com/methylDragon

Description:
This script lets you (naively) run through every page of a given source,
extracting a unique list of every internal link mentioned in <a> tags on
each page.
Both http:// and https:// versions of relative links are output, which helps
when scraping comments from Facebook's comment widget, where the scheme of
the page URL can be ambiguous.

Functions:
- get_domain_links(domain, start_page = 1, end_page = 99999)
  (This is a generator function: keep iterating it to get the next URL list.)
  (Make sure "domain" looks like "website.com/page/"; the page number is
   appended to it automatically. A (prefix, suffix) tuple is also accepted,
   in which case the page number is inserted between the two parts.)
  (Scrapes from start_page through end_page, inclusive.)
- get_links(url, internal = False, custom_domain = "")
  (Scrapes all links from url.)
  (Set internal to True if you only want pages from the source URL.)
  (Set custom_domain if you want the internal link checker to use another
   URL as the source, but still want to scrape from url.)
"""
from bs4 import BeautifulSoup
import requests
import re
import logging
import time
import random
# User-agent header for reducing 403 errors (makes requests look like they come from a normal browser)
user_agent_header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# Silence noisy log messages from requests and bs4
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("bs4").setLevel(logging.ERROR)
# Link scraping function
def get_links(url, internal = False, custom_domain = ""):
    # Download and parse the page
    html_page = requests.get(url, headers = user_agent_header)
    soup = BeautifulSoup(html_page.text, "lxml")

    # Initialise the URL list
    found_links = []

    # If we want internal URLs only, prepare the regex filtering term
    if internal:
        # If no custom_domain is given, strip the scheme off the source url
        if custom_domain == "":
            if url.startswith("http://"):
                url = url[7:]
            elif url.startswith("https://"):
                url = url[8:]
        # Otherwise, use the custom_domain (minus its scheme) as the filtering term
        else:
            if custom_domain.startswith("http://"):
                custom_domain = custom_domain[7:]
            elif custom_domain.startswith("https://"):
                custom_domain = custom_domain[8:]
            url = custom_domain
    # Else, filter out nothing
    else:
        url = ""

    # Add in absolute links, filtering out unwanted links using BeautifulSoup
    # (re.escape stops dots in the domain from acting as regex wildcards)
    for link in soup.findAll('a', attrs={'href': re.compile("^http[s]?://" + re.escape(url))}):
        found_links.append(link.get('href'))

    # Add in relative links (both HTTP and HTTPS versions)
    for link in soup.findAll('a', attrs={'href': re.compile("^/")}):
        found_links.append("https://" + url + link.get('href'))
        found_links.append("http://" + url + link.get('href'))

    # Return the de-duplicated list
    return list(set(found_links))
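# Minimal get_links() usage sketch, for reference. The URL is a hypothetical
# placeholder; internal = True keeps only links on the same site, and
# custom_domain can widen the filter to a parent domain while still scraping
# from the given url.
#
#     links = get_links("http://example.com/blog/", internal = True)
#     for found_link in sorted(links):
#         print(found_link)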
# Link scraping function across all pages in a domain
def get_domain_links(domain, start_page = 1, end_page = 99999):
    print("FETCHING DOMAIN", str(domain))

    # Strip scheme prefixes from a plain string domain
    if type(domain) == str:
        domain = domain.strip()
        if domain.startswith("http://"):
            domain = domain[7:]
        elif domain.startswith("https://"):
            domain = domain[8:]
    # If it's in (prefix, suffix) tuple form, strip it this alternative way,
    # keeping the tuple structure so the page number can still be inserted
    # between the two parts later on
    else:
        if domain[0].startswith("http://"):
            domain = (domain[0][7:].strip(), domain[1].strip())
        elif domain[0].startswith("https://"):
            domain = (domain[0][8:].strip(), domain[1].strip())

    # Yield the domain itself first (in case you just want to grab a single article)
    try:
        if type(domain) == str:
            yield ["http://" + str(domain)], "BASE"
        else:
            yield ["http://" + str(domain[0]) + str(domain[1])], "BASE"
    except GeneratorExit:
        print("\nCleaning up domain scraper")
        return
    # Initialise counters
    error_limit = 0
    unknown_error_limit = 0
    page_counter = start_page

    # Run the infinite loop until broken out of by accumulated errors
    while True:
        # Create the URL for the current page
        if type(domain) == str:
            page_url = "http://" + str(domain) + str(page_counter)
        else:
            try:
                page_url = "http://" + str(domain[0]) + str(page_counter) + str(domain[1])
            except Exception:
                pass

        print("\nFETCHING PAGE:", page_url)

        # Check to see if it's within the search range
        if page_counter > end_page:
            print("Page fetch limit reached:", page_counter - start_page)
            return

        # Download and parse the page
        try:
            page = requests.get(page_url, headers = user_agent_header)

            # If the first fetch fails, keep retrying roughly once a minute
            # for up to an hour before giving up on this page
            if page.status_code != 200:
                error_sub_limit = 0

                # Retry loop: up to 60 attempts, ~60 seconds apart
                while error_sub_limit < 60:
                    page = requests.get(page_url, headers = user_agent_header)
                    print("PAGE STATUS:", page.status_code)

                    # Escape the loop if it works!
                    if page.status_code == 200:
                        break
                    else:
                        print("ERROR FETCHING DOMAIN!")
                        error_sub_limit += 1
                        print("WAITING FOR 1 HOUR:", error_sub_limit)
                        time.sleep(10)
                        print(".....")
                        time.sleep(10)
                        print("....")
                        time.sleep(10)
                        print("...")
                        time.sleep(10)
                        print("..")
                        time.sleep(10)
                        print(".")
                        time.sleep(10)

                # Final try
                page = requests.get(page_url, headers = user_agent_header)

            print("\nFETCH STATUS:", page.status_code)

            # Count pages that still failed after retries; stop after too many
            if page.status_code != 200:
                error_limit += 1
                print("\nNetwork Errors encountered:", error_limit)
                if error_limit > 5:
                    print("URL FETCH ERROR LIMIT REACHED, BREAKING")
                    return
            else:
                print("\nPage #" + str(page_counter) + " fetched")

                # Note: yield is used here so we don't have to wait for every
                # page's links before parsing them; instead they can be
                # handled on a page-by-page basis. Generators are great!
                if type(domain) == str:
                    try:
                        yield get_links(page_url, True, domain.split("/")[0]), str(page_counter)
                    except GeneratorExit:
                        print("\nCleaning up domain scraper")
                        return
                else:
                    try:
                        yield get_links(page_url, True, domain[0].split("/")[0]), str(page_counter)
                    except GeneratorExit:
                        print("\nCleaning up domain scraper")
                        return
                    except Exception:
                        pass

        # Any other errors that stop the page fetch land here
        except Exception:
            unknown_error_limit += 1
            print("\nUnknown Fetch Errors encountered:", unknown_error_limit)
            if unknown_error_limit > 5:
                print("\nUNKNOWN FETCH ERROR LIMIT REACHED, BREAKING")
                return

        # Increment page counter
        page_counter += 1
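
# Minimal demo, guarded so that importing this module has no side effects.
# The (prefix, suffix) domain, page range, and random pause below are
# illustrative assumptions only; swap in a real paginated listing before
# actually running this.
if __name__ == "__main__":
    demo_domain = ("example.com/news/page/", "/")   # hypothetical (prefix, suffix) form
    for url_list, page_label in get_domain_links(demo_domain, start_page = 1, end_page = 2):
        print("PAGE", page_label, "->", len(url_list), "links found")
        # Small random pause between pages to avoid hammering the server
        time.sleep(random.uniform(1, 3))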