crawler.py
from bs4 import BeautifulSoup as Soup
import requests
from typing import List
import time
import urllib.parse
import json

headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive"
}


class Userinfo:
    def __init__(self, username, url):
        self.username = username
        self.url = url

    def __str__(self):
        return f"username: {self.username}, url: {self.url}"

    def __repr__(self):
        return f"<Userinfo: {self.__str__()}>"


class Crawler:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(headers)
        # Visit the website once to get the `sessionid` cookie
        self.session.get("https://steamcommunity.com")
        cookies = self.session.cookies.get_dict()
        self.sessionid = cookies["sessionid"]

    def crawl(self, username, write_html=False, validator=None) -> List[Userinfo]:
        """
        Args:
            validator: func(username) -> bool
                Omits the userinfo when it returns False
        """
        username_encoded = urllib.parse.quote(username, safe='')
        #baseurl = f"https://steamcommunity.com/search/users/#page={{}}&text={username_encoded}"
        baseurl = (
            "https://steamcommunity.com/search/SearchCommunityAjax"
            f"?text={username_encoded}&filter=users&sessionid={self.sessionid}"
            "&steamid_user=false&page="
        )
        userinfos = []
        if write_html:
            fout = open(f"output/{username_encoded}_{int(time.time())}.html", "w", encoding="utf-8")
            # Some text's color is `whitesmoke`, so write a dark background
            fout.write("""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <style>
        body {
            background-color: black;
            color: green;
        }
    </style>
</head>
<body>
""")
        else:
            fout = None
        i = 1
        while i < 501:  # For some reason it can't go beyond page 500
            url = baseurl + str(i)
            print('*' * 64)
            print(f"On {url}")
            req = self.session.get(url)
            json_obj = json.loads(req.text)
            page_obj = Soup(json_obj["html"], "html.parser")
            if fout:
                fout.write(json_obj["html"])
            # "Showing ... of ..."
            paging = page_obj.find("span", class_="community_searchresults_paging")
            paging_text = ' '.join(paging.get_text().split())
            print(paging_text)
            # Does it have a next page? The ">" link only appears when it does.
            has_next_page = '>' in paging_text
            for search_row in page_obj.find_all("div", class_="search_row"):
                name_obj = search_row.find("a", class_="searchPersonaName")
                name = name_obj.get_text()
                href = name_obj["href"]
                if not validator or validator(name):
                    userinfos.append(Userinfo(name, href))
            print('^' * 64)
            if not has_next_page:
                break
            i += 1
            time.sleep(0.1)
        if fout:
            fout.write("""
</body>
</html>
""")
            fout.close()
        return userinfos
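

# A minimal usage sketch, not part of the original script. It assumes network
# access to steamcommunity.com and, if write_html=True, an existing `output/`
# directory. The search term and the exact-match validator below are
# hypothetical examples; pass any func(username) -> bool, or None to keep
# every result.
if __name__ == "__main__":
    def exact_match(name: str) -> bool:
        # Keep only profiles whose display name matches the query exactly
        return name == "example_user"

    crawler = Crawler()
    results = crawler.crawl("example_user", write_html=False, validator=exact_match)
    for info in results:
        print(info)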