-
Notifications
You must be signed in to change notification settings - Fork 0
/
metacritic_scraper.py
167 lines (143 loc) · 5.4 KB
/
metacritic_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Import libraries
import asyncio
import csv
import os
import pickle
import sys
import time
from datetime import datetime
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent string sent with the scraper's requests
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# Header mapping passed along with each session.get() call
headers = {"User-Agent": _USER_AGENT}
# Single requests session shared by every fetch in this module
session = requests.Session()
def get_page_soup(url, timeout=30):
    """
    Fetch a webpage and return its parsed HTML content.
    Parameters:
    url (str): address of the page to download
    timeout (float or tuple): seconds to wait for the server before giving up;
        handed to requests unchanged (default 30)
    Returns:
    BeautifulSoup: the response text parsed with "html.parser"
    Raises:
    requests.exceptions.HTTPError: if the response status is 4xx/5xx
    requests.exceptions.Timeout: if the server does not answer within `timeout`
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the scraper forever.
    response = session.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.
    return BeautifulSoup(response.text, "html.parser")
def get_all_pages(max_retries=5, retry_delay=5):
    """
    Get all the pages with the list of games by checking the "pagination" element.
    Pages are requested in order starting at page 0; the walk stops at the first
    page that has no <ul class="pages"> element (that page is not kept).
    Parameters:
    max_retries (int or None): how many times a failing page is retried before
        the walk is abandoned; None retries forever
    retry_delay (float): seconds to sleep between two attempts on the same page
    Returns:
    list: one BeautifulSoup object per page fetched successfully, in page order.
        If a page keeps failing, the pages collected so far are returned.
    """
    base_url = "https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page="
    pages = []
    current_page = 0
    failures = 0  # consecutive failed attempts on the current page
    while True:
        page_url = base_url + str(current_page)
        try:
            soup = get_page_soup(page_url)
        except Exception as e:
            print(f"{e}")
            failures += 1
            # A permanently failing page must not trap the scraper in an
            # endless retry loop.
            if max_retries is not None and failures > max_retries:
                print(f"Giving up on {page_url} after {max_retries} retries")
                break
            time.sleep(retry_delay)
            continue
        failures = 0
        # No pagination element: we have walked past the last listing page.
        if not soup.find("ul", class_="pages"):
            break
        pages.append(soup)
        current_page += 1
    return pages
def get_rating(page):
    """
    Return the text of the Metascore element found inside `page`.
    The element is a <div> whose class string is "metascore_w large game"
    followed by "positive", "mixed" or "negative"; the three variants are
    tried in that order and the first one present wins.
    Parameters:
    page: BeautifulSoup element to search (any object exposing .find())
    Returns:
    str: whitespace-stripped text of the first matching element
    Raises:
    AttributeError: if none of the three variants is present
    """
    for verdict in ("positive", "mixed", "negative"):
        score_div = page.find("div", class_=f"metascore_w large game {verdict}")
        if score_div is not None:
            return score_div.get_text(strip=True)
    # Same exception type the previous implementation surfaced when nothing
    # matched, so callers that skip such entries keep working.
    raise AttributeError("no 'metascore_w large game' score element found")
def get_user_rating(page):
    """
    Return the text of the user-score element found inside `page`.
    The element is a <div> whose class string is "metascore_w user large game"
    followed by "positive", "mixed" or "negative"; the three variants are
    tried in that order and the first one present wins.
    Parameters:
    page: BeautifulSoup element to search (any object exposing .find())
    Returns:
    str: whitespace-stripped text of the first matching element, or
    float: NaN when none of the three variants is present
    """
    for verdict in ("positive", "mixed", "negative"):
        score_div = page.find("div", class_=f"metascore_w user large game {verdict}")
        if score_div is not None:
            return score_div.get_text(strip=True)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    # and raises AttributeError there.
    return np.nan
def extract_game_data(page):
    """
    Extract game information from a BeautifulSoup object containing a page's HTML content.
    Every <tr> of every <table> in the page is inspected; rows that lack any of
    the expected elements are skipped.
    Parameters:
    page: BeautifulSoup object holding one listing page
    Returns:
    list: one list per game, laid out as
        [number, name, platform, rating, rating_u, release_date, summary]
    """
    game_data = []
    # Distinct loop names: the original reused `page` at every nesting level,
    # shadowing the parameter.
    for table in page.find_all("table"):
        for row in table.find_all("tr"):
            try:
                number = row.find("span", class_="title numbered").get_text(strip=True)
                name = row.find("a", class_="title").get_text(strip=True)
                platform = row.find("span", class_="data").get_text(strip=True)
                rating = get_rating(row)
                rating_u = get_user_rating(row)
                # The release date is read from the fifth <span> of the row.
                release_date = row.find_all("span")[4].get_text(strip=True)
                summary = row.find("div", class_="summary").get_text(strip=True)
            except (AttributeError, IndexError):
                # A missing element shows up as None.get_text (AttributeError)
                # or as too few <span>s (IndexError); such rows are not games.
                continue
            game_data.append([number, name, platform, rating, rating_u, release_date, summary])
    return game_data
def get_current_ts():
    """
    Build a timestamp string from the current time.
    Returns:
    str: datetime.now() rendered as year_month_day_hour_minute_second
    """
    timestamp_format = '%Y_%m_%d_%H_%M_%S'
    now = datetime.now()
    return now.strftime(timestamp_format)
def write_pickle_obj(object_pi, fl_path):
    """
    Serialize an object with pickle and store it in a file.
    Parameters:
    object_pi (object): the object to serialize
    fl_path (str): path of the destination file; opened in 'wb' mode, so an
        existing file is overwritten
    Returns:
    None
    """
    # pickle emits bytes, hence the binary write mode
    with open(fl_path, mode='wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(object_pi)
def read_pickle_obj(fl_path):
    """
    Load a pickled object back from a file.
    Parameters:
    fl_path (str): path of the file holding the pickled object
    Returns:
    object: whatever object the pickle stream in the file describes
    """
    # pickle data is bytes, hence the binary read mode
    with open(fl_path, mode='rb') as source:
        return pickle.Unpickler(source).load()
# Scrape all pages
pages = get_all_pages()
# Make sure the output folder exists before anything is written into it
os.makedirs("data", exist_ok=True)
# os.path.join instead of a hard-coded "\\" so the files also land inside
# the data folder on non-Windows systems
pages_path = os.path.join("data", "pages.obj")
# Save pages read as pickle object (stored as HTML strings), then load them
# back and re-parse them
write_pickle_obj([str(x) for x in pages], pages_path)
pages = [BeautifulSoup(x, "html.parser") for x in read_pickle_obj(pages_path)]
# Extract the game rows of every page into one flat list; each row is a list
# of the seven column values named below
all_game_data = []
for page in pages:
    all_game_data.extend(extract_game_data(page))
# Get game data in a dataframe
all_game_dataframe = pd.DataFrame(all_game_data, columns=["index", "name", "platform", "metacritic", "user_rating", "release_date", "summary"])
# Write to csv, one timestamped file per run
csv_path = os.path.join("data", f"{get_current_ts()}_metacritic.csv")
all_game_dataframe.to_csv(csv_path, quoting=csv.QUOTE_ALL, index=False)