# spiderbolt.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import defaultdict
from threading import Lock, Thread
from queue import Queue
from colorama import Fore, Style, init
from datetime import datetime
import random
import os
init(autoreset=True)
visited_lock = Lock()
def art():
    art = f"""{Fore.RED}
██
██
███████ ██████ ██ ██████ ███████ ██████ ██████ ██████ ██ ████████ ██
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██████ ██ ██ ██ █████ ██████ ██████ ██ ██ ██ ██ ███████
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██ ██ ██████ ███████ ██ ██ ██████ ██████ ███████ ██ ██
██
██
- Just A Link Scraper by ogtirth ;)
{Style.RESET_ALL}
"""
    print(art)

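# dname(): turn the target URL into a filesystem-friendly base name for the
# output file, falling back to the path when the URL has no host part.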
def dname(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc or parsed_url.path.replace("/", "_")

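# dsame(): keep the crawl on the starting domain by comparing network locations.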
def dsame(url, domain):
    return urlparse(url).netloc == domain

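# lpath(): group collected links by their first path segment ("root" for "/")
# so the report can list them section by section.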
def lpath(links):
    grouped_links = defaultdict(list)
    for link in links:
        parsed = urlparse(link)
        path = parsed.path.split("/")[1] if "/" in parsed.path and parsed.path != "/" else "root"
        grouped_links[path].append(link)
    return grouped_links

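# scrape(): fetch one page with a randomly chosen User-Agent, extract all
# same-domain <a href> links, and enqueue any URL not seen before. The shared
# sets (visited, html_links, other_links) are updated under visited_lock.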
def scrape(url, domain, user_agents, visited, html_links, other_links, request_queue):
    try:
        user_agent = random.choice(user_agents)
        headers = {"User-Agent": user_agent}
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            for a_tag in soup.find_all("a", href=True):
                full_url = urljoin(url, a_tag["href"])
                if dsame(full_url, domain):
                    with visited_lock:
                        if full_url not in visited:
                            visited.add(full_url)
                            request_queue.put(full_url)
                            if full_url.endswith((".html", ".php", ".asp", ".aspx", "/")):
                                html_links.add(full_url)
                            else:
                                other_links.add(full_url)
                            print(f"{Fore.GREEN}{full_url}{Style.RESET_ALL}")
    except Exception as e:
        print(f"{Fore.RED}Error scraping {url}: {e}{Style.RESET_ALL}")

def main():
    os.system("cls" if os.name == "nt" else "clear")
    art()
    url = input(f"{Fore.BLUE}Enter the website link to scrape ;) {Style.RESET_ALL}").strip()
    domain = urlparse(url).netloc
    while True:
        try:
            num_threads = int(input(f"{Fore.BLUE}How many threads do you want to use?: {Style.RESET_ALL}").strip())
            if num_threads <= 0 or num_threads > 500:
                raise ValueError("Threads must be between 1 and 500 :) ")
            break
        except ValueError as e:
            print(f"{Fore.RED}Invalid input:/ {e}{Style.RESET_ALL}")
    try:
        with open("user-agents.txt", "r") as f:
            user_agents = [ua.strip() for ua in f.readlines()]
    except FileNotFoundError:
        print(f"{Fore.RED}Error: 'user-agents.txt' file not found. Please provide a valid file :/{Style.RESET_ALL}")
        return
    visited = set()
    html_links = set()
    other_links = set()
    request_queue = Queue()
    request_queue.put(url)
    visited.add(url)
    print(f"{Fore.YELLOW}\n[Scraping links...]\n{Style.RESET_ALL}")
    threads = []
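    # Each worker pulls URLs from the shared queue and scrapes them until the
    # queue is momentarily empty, at which point the worker exits.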
    def worker():
        while not request_queue.empty():
            current_url = request_queue.get()
            scrape(current_url, domain, user_agents, visited, html_links, other_links, request_queue)
            request_queue.task_done()

    for _ in range(num_threads):
        thread = Thread(target=worker)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    domain_name = dname(url)
    output_file = f"{domain_name}.txt"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    html_grouped = lpath(html_links)
    other_grouped = lpath(other_links)
    with open(output_file, "w") as f:
        f.write(f"Scraping Timestamp: {timestamp}\n")
        f.write(f"Target URL: {url}\n")
        f.write("=" * 80 + "\n\n")
        f.write("########## HTML LINKS BY PATH ##########\n\n")
        for path, links in html_grouped.items():
            f.write(f"===== {path.upper()} =====\n")
            for link in links:
                f.write(link + "\n")
            f.write("\n")
        f.write("########## OTHER LINKS BY PATH ##########\n\n")
        for path, links in other_grouped.items():
            f.write(f"===== {path.upper()} =====\n")
            for link in links:
                f.write(link + "\n")
            f.write("\n")
    print(f"{Fore.CYAN}\n" + "=" * 80 + f"{Style.RESET_ALL}")
    print(f"{Fore.GREEN}Scraping completed. {len(html_links)} HTML links and {len(other_links)} other links saved to {output_file}.{Style.RESET_ALL}")
    print(f"{Fore.CYAN}" + "=" * 80 + f"{Style.RESET_ALL}")


if __name__ == "__main__":
    main()
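# Example interactive run (the results file name comes from dname(); the URL
# and thread count below are illustrative):
#   $ python spiderbolt.py
#   Enter the website link to scrape ;) https://example.com
#   How many threads do you want to use?: 10
#   ...
# Links are saved to example.com.txt, grouped by first path segment.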