-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
125 lines (104 loc) · 4.64 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
"""Scrape game details from the Steam store (falling back to SteamDB) and
write them to a CSV file.

Input: a JSON file containing an 'rgOwnedApps' list of Steam app ids.
Output: a CSV with columns title, developer, publisher, release_date, genre.
"""
import requests
import json
from bs4 import BeautifulSoup
import csv
import io
import argparse

# --- command-line arguments ---
parser = argparse.ArgumentParser(
    prog='run.py', description="Script to scrape game details from steam and write it into a csv file")
parser.add_argument('-i', metavar='input',
                    help="input filename", required=True, type=str)
parser.add_argument('-o', metavar='output',
                    help="output filename", required=True, type=str)
args = parser.parse_args()

# --- scraping configuration ---
url_prefix_steam = 'http://store.steampowered.com/app/'
url_prefix_steamdb = 'https://steamdb.info/app/'
filter_string_steam = '.responsive_apppage_details_left.game_details .details_block'
filter_string_steamdb = '.row.app-row tbody tr'
# cookie to get through age check
cookies = {'birthtime': '568022401'}
columns = ['title', 'developer', 'publisher', 'release_date', 'genre']

# --- parse the input JSON ---
# Use a context manager so the input file handle is closed (the original
# open(...).read() leaked it), and json.load to parse straight from the file.
with open(args.i) as json_file:
    json_data = json.load(json_file)
rgOwnedApps = json_data['rgOwnedApps']
def _parse_linked_names(detail):
    """Return the comma-joined anchor texts from a prettified Steam detail
    fragment of the form '<b>Label:</b> <a ...">Name</a>, <a ...">Name</a>'.

    This logic was previously copy-pasted for Genre, Developer and Publisher.
    """
    links = detail.split('</b>')[1].split(", <a")
    names = []
    for link in links:
        names.append(link.split("\">")[1].replace('</a>', ''))
    return ','.join(names)


with open(args.o, "w", newline="") as csvfile:
    # create csv file and add columns
    writer = csv.writer(csvfile, delimiter=',',
                        quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(columns)
    for game_id in rgOwnedApps:
        url = url_prefix_steam + str(game_id)
        # get the html data and parse it
        html_response = requests.get(url, cookies=cookies)
        soup = BeautifulSoup(html_response.text, 'html.parser')
        soup_data = soup.select(filter_string_steam)
        # to make sure the game still exists
        if len(soup_data) > 0:
            print("Scraping game data with id {0}.".format(game_id))
            # get game details, it's in the first block
            soup_prettified = soup_data[0].prettify()
            # flatten the prettified HTML and split on <br/> separators
            game_details = soup_prettified.replace('\n', '').split('<br/>')
            title = ""
            developer = ""
            genre = ""
            publisher = ""
            release_date = ""
            # make sure the items exist before reading them
            for detail in game_details:
                if "Title" in detail:
                    title = detail.split('</b>')[1]
                if "Genre" in detail:
                    genre = _parse_linked_names(detail)
                if "Developer" in detail:
                    developer = _parse_linked_names(detail)
                if "Publisher" in detail:
                    publisher = _parse_linked_names(detail)
                if "Release" in detail:
                    release_date = detail.split('</b>')[1]
            # write details into the csv file
            writer.writerow([title, developer, publisher, release_date, genre])
        else:
            # store page unavailable in this region -> fall back to SteamDB
            print("The game with id {0} doesn't exist in your Steam region. Trying to fetch it from SteamDB.".format(
                game_id))
            url = url_prefix_steamdb + str(game_id)
            html_response = requests.get(url)
            soup = BeautifulSoup(html_response.text, 'html.parser')
            soup_data_rows = soup.select(filter_string_steamdb)
            title = ""
            developer = ""
            # NOTE(review): genre is never populated on the SteamDB path;
            # the CSV column is written empty here (preserved behavior).
            genre = ""
            publisher = ""
            release_date = ""
            for row in soup_data_rows:
                cells = row.select('td')
                # first cell holds the row label, second the value markup
                name = str(cells[0])
                data = str(cells[1])
                if "name" in data:
                    title = data.split("\">")[1].replace('</td>', '')
                if "author" in data:
                    developer += data.split("\">")[1].split('<')[0]
                if "publisher" in data:
                    publisher += data.split("\">")[1].split('<')[0]
                if "Release Date" in name:
                    release_date = data.replace('<td>', '').split('<')[0]
            writer.writerow([title, developer, publisher, release_date, genre])