web_crawler.py
# Internal and external links are stored in sets so that no link is recorded twice.
import sys
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
"""
should not use {} to initilize empty set here as it may cause confusion with empty dict as well ,better to implicitly
define it as a set using set() . Used set here to avoid repeating links in recursion as it will become an infinte loop otherwise
"""
visited_links = set()   # stores links that have already been visited
internal_links = set()  # groups internal links (same domain), which are crawled further
external_links = set()  # stores external links (different domain), which are only listed, not crawled
def crawl(url, start_domain, depth, max_depth):
    """
    Crawls the given URL and extracts links from the page.
    Recursively follows links within the same domain up to the maximum depth threshold.
    """
    if max_depth is not None and depth > max_depth:
        return
    visited_links.add(url)
    print("Crawling:", url)
    try:
        response = requests.get(url)  # on a successful request, response.status_code is 200
    except requests.exceptions.RequestException as e:
        print("An error occurred:", e)
        return
    if response.status_code != 200:
        print("Failed to crawl:", url)
        return
"""
#reponse.text is a str containing html or xml test of the url we can also use response.content which returns the html
or xml doc in bytes format which can be converted to str using response.content.decode() . As for 2nd argument we can use
"html.parser" or "lxml" html.parser is linenent and can convert even imperfect or malfunctioned html but lxml has more features
and it is also faster. Hence, it is recommended for html and xml docs
"""
    soup = BeautifulSoup(response.text, "lxml")
    links = soup.find_all(["a", "img", "link", "script", "iframe", "form"])
    for link in links:
        if "href" in link.attrs:
            href = link["href"]
            if href.startswith("http"):
                next_url = href
            else:
                next_url = urljoin(url, href)  # handles relative URLs (makes a relative URL absolute)
            """
            urljoin is used instead of simply concatenating the strings because, if the base URL
            already ends in a relative part, urljoin removes it, attaches the relative URL passed
            as the argument, and returns a proper absolute URL.
            """
            process_url(next_url, start_domain, depth, max_depth)
        if "src" in link.attrs:
            src = link["src"]
            if src.startswith("http"):
                next_url = src
            else:
                next_url = urljoin(url, src)
            process_url(next_url, start_domain, depth, max_depth)
def process_url(url, start_domain, depth, max_depth):
    """
    Processes the URL and decides whether to crawl it as an internal link or record it as an external link.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if domain == start_domain:
        if url not in visited_links:
            if max_depth is None or depth <= max_depth:
                internal_links.add(url)
                crawl(url, start_domain, depth + 1, max_depth)
    else:
        external_links.add(url)
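# Illustrative note (not part of the crawler's logic): urlparse().netloc is the host part used for
# the internal/external comparison above, e.g.
#   urlparse("https://blog.example.com/post/1?ref=home").netloc   # 'blog.example.com'
#   urlparse("https://example.com/about").netloc                  # 'example.com'
# so a subdomain such as blog.example.com is treated as a different domain than example.com.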
def group_domain():
    domain = set()
    for link in external_links:
        parsed_url = urlparse(link)
        if parsed_url.netloc:
            domain.add(parsed_url.netloc)
    for domain_name in domain:
        print(f"\n{domain_name}\n")
        for link in external_links:
            parsed_url = urlparse(link)
            if domain_name == parsed_url.netloc:
                print("\t", link)
def sort_extensions(links):
    extension = set()
    for link in links:
        parsed_link = urlparse(link)
        path = parsed_link.path
        extension.add(os.path.splitext(path)[1])
    for ext in extension:
        if ext != "":
            print(f"\n{ext}\n")
        else:
            print("\nMiscellaneous\n")
        for link in links:
            parsed = urlparse(link)
            link_path = parsed.path
            link_ext = os.path.splitext(link_path)[1]
            if link_ext == ext:
                print("\t", link)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Web Crawler")
    parser.add_argument("-u", "--url", help="Start URL", required=True)
    parser.add_argument("-t", "--threshold", help="Recursion depth threshold", type=int, default=None)
    parser.add_argument("-o", "--output", help="Create an output file", default="")
    parser.add_argument("-s", "--simple", help="Simplify the results", action="store_true")
    parser.add_argument("-e", "--ext_sort", help="Sort the external links based on extension", action="store_true")
    args = parser.parse_args()
    start_url = args.url
    threshold = args.threshold
    output_file = args.output
    simple = args.simple
    ext_sort = args.ext_sort
    parsed_start_url = urlparse(start_url)
    start_domain = parsed_start_url.netloc
    flag = False  # tracks whether an output file was created
    if threshold is None or threshold > 0:
        if output_file:
            if os.path.exists(output_file):
                print("\nThis file already exists in the current directory... printing output on the command line\n\n")
            else:
                path = f"./{output_file}"
                sys.stdout = open(path, "w")
                flag = True
        crawl(start_url, start_domain, 1, threshold)
        print("\nInternal links:\n")
        if simple:  # simplified output
            for link in internal_links:
                print("\t", link)
        else:
            sort_extensions(internal_links)
        print(f"\nTotal Internal Links = {len(internal_links)}\n")
        print("\nExternal links:")
        if simple:  # simplified output
            for link in external_links:
                print("\t", link)
        else:
            group_domain()
        if ext_sort:
            print("\nExternal links sorted based on extensions:\n")
            sort_extensions(external_links)
        print(f"\nTotal External Links = {len(external_links)}\n")
        print(f"\nTotal Links = {len(internal_links) + len(external_links)}\n")
        if flag:
            sys.stdout.close()           # close the output file so everything is flushed to disk
            sys.stdout = sys.__stdout__  # restore sys.stdout; sys.__stdout__ is the original output stream
            print(f"\nTotal Internal Links Found = {len(internal_links)}\n")
            print(f"\nTotal External Links Found = {len(external_links)}\n")
            print(f"\nTotal Links Found = {len(internal_links) + len(external_links)}\n")
    else:
        print("Invalid Threshold")