-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.py
171 lines (141 loc) · 6.73 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import csv
import subprocess
import sys
from datetime import datetime, timezone
from urllib.parse import quote, urlencode

import requests
from bs4 import BeautifulSoup
# Site roots from which every request URL in this module is built.
BASE_URL_1337X = "https://1337x.to"  # HTML site scraped for search results and detail pages
API_BASE_URL_PIRATEBAY = "https://apibay.org"  # JSON search API queried by search_piratebay
# Function to perform a search query on The Pirate Bay
def search_piratebay(search_term):
    """Run a search against the apibay.org ``/q.php`` endpoint.

    Args:
        search_term: Free-text query, sent as the ``q`` query parameter.

    Returns:
        The decoded JSON body of the response. ``main`` iterates it as a
        sequence of dicts and reads the keys ``name``, ``size``, ``seeders``,
        ``leechers``, ``added`` and ``info_hash``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Let requests build the query string so that spaces, '&', '#', etc. in
    # the search term are encoded instead of corrupting the URL.
    response = requests.get(
        f"{API_BASE_URL_PIRATEBAY}/q.php",
        params={"q": search_term},
        timeout=30,  # without a timeout a stalled server blocks the script forever
    )
    response.raise_for_status()
    return response.json()
# Function to perform a search query on 1337x with category selection
def search_1337x(search_term, category="all"):
    """Search 1337x and return the parsed first page of results.

    Args:
        search_term: Free-text query; percent-encoded before being placed in
            the URL path.
        category: One of ``all``, ``movies``, ``tv``, ``games``, ``music``,
            ``applications``, ``documentaries``, ``anime`` or ``other``.
            ``all`` and any unrecognised value use the uncategorised search.

    Returns:
        The list of torrent dicts produced by ``parse_1337x_results``.

    Raises:
        requests.HTTPError: If the search page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Maps the user-facing category name to the path segment used by the site.
    category_segments = {
        "movies": "Movies",
        "tv": "TV",
        "games": "Games",
        "music": "Music",
        "applications": "Apps",
        "documentaries": "Documentaries",
        "anime": "Anime",
        "other": "Other",
    }

    # The term becomes a path segment, so '/', '?', '#' and spaces must all be
    # escaped; safe="" makes quote() escape '/' as well.
    encoded_term = quote(search_term, safe="")

    segment = category_segments.get(category)
    if segment is None:
        path = f"/search/{encoded_term}/1/"
    else:
        path = f"/category-search/{encoded_term}/{segment}/1/"

    response = requests.get(BASE_URL_1337X + path, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return parse_1337x_results(soup)
# Function to parse 1337x search results
def parse_1337x_results(soup):
    """Extract torrent records from a parsed 1337x search results page.

    For every ``tbody tr`` row the title, size, seeders, leeches, upload date
    and uploader are read from the row's cells, and the magnet link is fetched
    from the torrent's details page (one extra HTTP request per row).

    Args:
        soup: ``BeautifulSoup`` document of a search results page.

    Returns:
        A list of dicts with the keys ``Title``, ``Size``, ``Seeders``,
        ``Leeches``, ``Date Uploaded``, ``Uploader`` and ``Magnet Link``.
        Rows that lack any of the expected cells are skipped. ``Magnet Link``
        is ``'N/A'`` when the details page could not be fetched.
    """
    torrents = []
    for row in soup.select('tbody tr'):
        name_tag = row.select_one('.coll-1.name a:nth-child(2)')
        seeders_cell = row.select_one('.coll-2')
        leeches_cell = row.select_one('.coll-3')
        date_cell = row.select_one('.coll-date')
        size_cell = row.select_one('.coll-4.size')
        uploader_cell = row.select_one('.coll-5')

        # Skip rows that do not have the expected layout instead of crashing
        # on a None lookup.
        cells = (name_tag, seeders_cell, leeches_cell, date_cell, size_cell, uploader_cell)
        if any(cell is None for cell in cells) or not name_tag.get('href'):
            continue

        # Use only the cell's own text nodes for the size so that text from
        # nested elements is not glued onto it.
        # NOTE(review): 1337x appears to nest a seeders <span> inside the size
        # cell - confirm against live markup.
        size = ''.join(size_cell.find_all(string=True, recursive=False)).strip()
        if not size:
            size = size_cell.text.strip()

        details_url = f"{BASE_URL_1337X}{name_tag['href']}"
        # A failure on one details page should not discard the whole search.
        try:
            magnet_link = fetch_magnet_link(details_url)
        except requests.RequestException:
            magnet_link = 'N/A'

        torrents.append({
            'Title': name_tag.text.strip(),
            'Size': size,
            'Seeders': seeders_cell.text.strip(),
            'Leeches': leeches_cell.text.strip(),
            'Date Uploaded': date_cell.text.strip(),
            'Uploader': uploader_cell.text.strip(),
            'Magnet Link': magnet_link,
        })
    return torrents
# Function to fetch the magnet link from a 1337x torrent details page
def fetch_magnet_link(details_url):
    """Download a 1337x torrent details page and return its magnet link.

    Args:
        details_url: Absolute URL of the torrent details page.

    Returns:
        The ``href`` of the ``#openPopup`` element; if that element is absent,
        the ``href`` of the first anchor whose target starts with
        ``magnet:``. ``'N/A'`` when neither is found or the element has no
        ``href``.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(details_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    magnet_link_tag = soup.select_one('#openPopup')
    if magnet_link_tag is None:
        # Fall back to any magnet anchor in case the element id is missing.
        magnet_link_tag = soup.select_one('a[href^="magnet:"]')
    if magnet_link_tag is None:
        return 'N/A'
    # .get avoids a KeyError when the element exists but has no href.
    return magnet_link_tag.get('href', 'N/A')
# Function to sort torrents by seeders and date uploaded (descending)
def sort_torrents(torrents):
    """Return a new list of torrents ordered from highest to lowest rank.

    Rank is the pair (seeders as int, 'Date Uploaded' string); a missing
    'Date Uploaded' counts as the empty string. The input is left untouched.
    """
    def ranking(entry):
        return int(entry['Seeders']), entry.get('Date Uploaded', '')

    ordered = list(torrents)
    ordered.sort(key=ranking, reverse=True)
    return ordered
# Function to display torrents in batches and prompt the user to continue
def display_torrents(torrents):
    """Print torrents as aligned rows, 25 at a time.

    After each full page that is not the last, the user is asked whether to
    continue; any answer other than 'yes' stops the listing. Missing fields
    are shown as 'N/A'.
    """
    page_size = 25
    # (dict key, column width) in output order.
    columns = (
        ('Title', 50),
        ('Size', 10),
        ('Seeders', 8),
        ('Leeches', 8),
        ('Date Uploaded', 15),
        ('Uploader', 10),
    )
    total = len(torrents)
    for start in range(0, total, page_size):
        for entry in torrents[start:start + page_size]:
            print(" ".join(f"{entry.get(key, 'N/A'):<{width}}" for key, width in columns))
        if start + page_size >= total:
            return
        answer = input("Do you want to see 25 more results? (yes/no): ").strip().lower()
        if answer != 'yes':
            return
# Main function
def main():
    """Interactive entry point: search both sites, filter, display and export.

    Prompts for a search term and a category, collects results from 1337x and
    The Pirate Bay, keeps the torrents with at least the requested number of
    seeders, prints them in pages and writes them to ``sorted_torrents.csv``
    in the current working directory.

    A network failure on one site is reported and the run continues with the
    results of the other site.
    """
    search_term = input("Enter the search term (e.g., movie name): ").strip()
    category = input("Enter the category (all/movies/tv/games/music/applications/documentaries/anime/other): ").strip().lower()
    all_torrents = []

    print(f"Searching torrents for '{search_term}' in category '{category}' on 1337x...")
    try:
        all_torrents += search_1337x(search_term, category)
    except requests.RequestException as error:
        print(f"1337x search failed: {error}")

    print(f"Searching torrents for '{search_term}' on The Pirate Bay...")
    try:
        piratebay_torrents = search_piratebay(search_term)
    except requests.RequestException as error:
        print(f"The Pirate Bay search failed: {error}")
        piratebay_torrents = []

    # Convert the API records to the same dict layout used for 1337x rows.
    for torrent in piratebay_torrents:
        all_torrents.append({
            'Title': torrent['name'],
            'Size': format_size(torrent['size']),
            'Seeders': torrent['seeders'],
            'Leeches': torrent['leechers'],
            'Date Uploaded': format_date(torrent['added']),
            'Magnet Link': create_magnet_link(torrent['info_hash'], torrent['name']),
            'Uploader': 'N/A'  # The Pirate Bay API does not provide uploader information
        })

    # Keep asking until the threshold is a valid integer instead of crashing
    # with a ValueError after all the scraping work is done.
    while True:
        raw_threshold = input("Enter the minimum number of seeders required for the CSV file: ").strip()
        try:
            min_seeders = int(raw_threshold)
        except ValueError:
            print("Please enter a whole number.")
        else:
            break

    filtered_torrents = [torrent for torrent in all_torrents if int(torrent['Seeders']) >= min_seeders]
    sorted_torrents = sort_torrents(filtered_torrents)

    print(f"\n{'Title':<50} {'Size':<10} {'Seeders':<8} {'Leeches':<8} {'Date Uploaded':<15} {'Uploader':<10}")
    print("=" * 100)
    display_torrents(sorted_torrents)

    fieldnames = ["Title", "Size", "Seeders", "Leeches", "Date Uploaded", "Uploader", "Magnet Link"]
    # Explicit UTF-8 so titles with non-ASCII characters do not fail on
    # platforms whose default encoding cannot represent them.
    with open("sorted_torrents.csv", "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sorted_torrents)

    # The filter above is inclusive (>=), so say "at least" rather than "more than".
    print(f"\nScraping completed. Torrents with at least {min_seeders} seeders were saved to 'sorted_torrents.csv'.")
# Function to format the size from bytes to a readable format with whole numbers
def format_size(size_bytes):
    """Format a byte count as a whole number with a binary unit suffix.

    Args:
        size_bytes: Size in bytes, as an int or anything ``int()`` accepts
            (for example a numeric string).

    Returns:
        A string such as ``"512 B"`` or ``"3 GB"``. The value is divided by
        1024 until it drops below 1024 and is then rounded with ``round()``.
        Sizes of 1024 PB and above are expressed in PB.

    Raises:
        ValueError: If ``size_bytes`` cannot be converted to an int.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    size = int(size_bytes)
    for unit in units[:-1]:
        if size < 1024:
            return f"{round(size)} {unit}"
        size /= 1024
    # Largest unit: always return a string rather than falling off the loop
    # and implicitly returning None for very large inputs.
    return f"{round(size)} {units[-1]}"
# Function to convert UNIX timestamp to a readable date
def format_date(unix_timestamp):
    """Convert a UNIX timestamp to a ``YYYY-MM-DD`` date string in UTC.

    Args:
        unix_timestamp: Seconds since the epoch, as an int or anything
            ``int()`` accepts (for example a numeric string).

    Returns:
        The UTC calendar date of the timestamp, e.g. ``"1970-01-01"``.

    Raises:
        ValueError: If ``unix_timestamp`` cannot be converted to an int.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC conversion yields the same calendar date.
    moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
    return moment.strftime('%Y-%m-%d')
# Function to create a properly formatted magnet link
def create_magnet_link(info_hash, name):
    """Build a magnet URI from an info hash and a display name.

    The name is URL-encoded and appended as the ``dn`` parameter after the
    ``xt=urn:btih:<info_hash>`` part.
    """
    display_name_param = urlencode([("dn", name)])
    return "magnet:?xt=urn:btih:{}&{}".format(info_hash, display_name_param)
if __name__ == "__main__":
    main()
    # After completing the scraping, run the verify script. Use the
    # interpreter that is running this script: a bare "python" may be missing
    # from PATH or point at a different installation without the required
    # packages.
    print("Running the verify script...")
    subprocess.run([sys.executable, "verify.py"], check=False)