"""
STEP 2 - FootballGameInfoScraper.py
This step should take a long time, therefore do STEP 3 & 4 while waiting.
Takes a list of pro-football-reference game urls within "game_url_list.txt" and returns
the wanted data via webscraping: kicking data, weather data, and stadium data.
Functions:
- FormatWeather: Properly formats weather data so it can be stored correctly.
- informationScraper: Gets all information for the output file via bs4 and Selenium.
- main: Reads the url txt file, starts the scrape, and stores the gathered information.
"""
import time
import requests
import re
from bs4 import BeautifulSoup
# Imports needed for selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
# Options added to the chromedriver, so we can webscrape without havoc
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument("disable-quic")
options.add_argument("--disable-proxy-certificate-handler")
options.add_argument('--blink-settings=imagesEnabled=false')
# Return control once the DOM is ready instead of waiting for every resource
options.page_load_strategy = 'eager'
def FormatWeather(weather_text:str):
"""
Takes a string of weather info and correctly formats it.
Parameters:
- weather_text(string): Weather data scraped from pro-football-reference.com
Returns:
string: Formatted weather data.
"""
# Replaces "no wind" with 0
if "no wind" in weather_text:
weather_text = weather_text.replace("no wind", "0")
# Defines a list of words to remove
words_to_remove = ["degrees", "relative humidity", "wind", "mph", "chill"]
# Create a regex pattern by joining the words with the "|" (OR) operator
pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + \
r')\b\s*|\%', flags=re.IGNORECASE)
# Use the sub function to replace the matched words, spaces,
# and percentage signs with an empty string, keeping the commas
weather = re.sub(pattern, '', weather_text)
# Gets rid of spaces
weather = weather.replace(" ", "")
# If the wind chill data is missing (only two commas), add "N/A" for its column
comma_count = weather.count(',')
if comma_count == 2:
weather += ",N/A"
return weather
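# Example of the formatting above (hypothetical input strings, mirroring the usual
# pro-football-reference wording; not scraped values):
#   FormatWeather("71 degrees, relative humidity 69%, wind 5 mph")
#       -> "71,69,5,N/A"
#   FormatWeather("35 degrees, relative humidity 59%, wind 12 mph, wind chill 27")
#       -> "35,59,12,27"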
def informationScraper(game_url_list:list, start_line:int):
"""
Obtains all information needed for the "infoCSV.txt" to be created. Uses
BeautifulSoup and Selenium on pro-football-reference.com via the given game urls to
obtain information about each game.
Parameters:
- game_url_list(list): A url list of games to be scraped.
- start_line(int): Line number within "game_url_list.txt" to start scraping at.
Returns: list:
- [0] list: scraped data that is properly formatted.
- [1] list: pro-football-reference.com game urls that returned an ERROR.
"""
info_csv, bad_info_csv = [], []
line = start_line
# Set up the WebDriver (Chrome, with webdriver_manager supplying the driver binary)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# Takes each game individually and scrapes it
for url in game_url_list[start_line:]:
# To prevent pro-football-reference from temp IP banning us, we delay each request
time.sleep(6)
response = requests.get(url)
# Checks for a temp ban
wait_time = response.headers.get('Retry-After')
# If there is a ban, waits until it is over
# (1 hour is typical from pro-football-reference)
if wait_time:
print(wait_time)
time.sleep(int(wait_time) + 1)
response = requests.get(url)
# Checks if we had a successful request
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
#----------------------------------------------------------------------------
# Stadium
#----------------------------------------------------------------------------
# Find first 'a' element with an 'href' attribute starting with "/stadiums/"
stadium_element = soup.find('a', href=lambda href: href and \
href.startswith("/stadiums/"))
# Check if a stadium element was found
if stadium_element:
# Extract the text content of the stadium element
stadium = stadium_element.text
else:
# If no stadium element is found, print a message and set the stadium to "N/A"
print("Missing: Stadium")
stadium = "N/A"
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Selenium
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Gives the selenium webdriver the url
driver.get(url)
# Set an implicit wait (4 seconds) so element lookups give dynamic content time to load
driver.implicitly_wait(4)
#----------------------------------------------------------------------------
# Roof
#----------------------------------------------------------------------------
try:
# Use WebDriverWait to wait for the presence of an element with specific XPATH
roof_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH,'//th[@data-stat="info" and text()="Roof"]'))
)
# If element is found, extract text content of the corresponding 'td' element
roof = roof_element.find_element(By.XPATH, '../td[@data-stat="stat"]').text
except Exception:
# If there's an exception (element not found or timeout), set 'roof' to "N/A"
print("Missing: Roof")
roof = "N/A"
#----------------------------------------------------------------------------
# Surface
#----------------------------------------------------------------------------
try:
# Use WebDriverWait to wait for presence of an element with specific XPATH
surface_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//th[@data-stat="info" and text()="Surface"]'))
)
# If the element is found, extract the text content of the corresponding 'td' element
surface = surface_element.find_element(By.XPATH, '../td[@data-stat="stat"]').text
except Exception:
# If there's an exception (element not found or timeout), print a message and set 'surface' to "N/A"
print("Missing: Surface")
surface = "N/A"
#----------------------------------------------------------------------------
# Field Goals
#----------------------------------------------------------------------------
try:
# Use WebDriverWait to wait for the presence of all elements with specific XPATH
fgm_elements = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.XPATH, '//td[@data-stat="fgm"]'))
)
# Use WebDriverWait to wait for the presence of all elements with specific XPATH
fga_elements = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.XPATH, '//td[@data-stat="fga"]'))
)
fgm, fga = 0,0
# Iterate through corresponding FGM and FGA elements, calculating the total FGM and FGA
for fgm_element, fga_element in zip(fgm_elements, fga_elements):
fgm_per = fgm_element.text
fga_per = fga_element.text
# If FGM or FGA data is present, add it to the running total
if fgm_per:
fgm += int(fgm_per)
if fga_per:
fga += int(fga_per)
except Exception:
# If there's an exception (elements not found or timeout), set FGM and FGA to 0
fgm, fga = 0,0
#----------------------------------------------------------------------------
# Weather
#----------------------------------------------------------------------------
# Check if roof element is not in the specified list
if roof not in ["dome", "(closed)"]:
try:
# Use WebDriverWait to wait for the presence of an element with specific XPATH for 'Weather'
weather_element = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.XPATH, '//th[@data-stat="info" and text()="Weather"]'))
)
# Extract the text content of the corresponding 'td' element
weather = weather_element.find_element(By.XPATH, '../td[@data-stat="stat"]').text
# Format weather
weather = FormatWeather(weather)
except Exception:
# If there's an exception (element not found or timeout), set weather to default
print("Missing: Weather")
weather = "N/A,N/A,N/A,N/A"
else:
# Set weather to default
weather = "N/A,N/A,N/A,N/A"
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Appends all grabbed information to the info_csv list and prints it
row = ",".join([str(line), stadium, roof, surface, str(fgm), str(fga), weather])
info_csv.append(row)
print(row)
else:
# If the request was unsuccessful, add the game url to bad_info_csv
print(f"Failed to retrieve the HTML content. Status code: {response.status_code}")
bad_info_csv.append(str(line) + "," + url)
# Notifies you which line you are on
print(str(line) + " out of 6162")
# Add to the count
line += 1
# Saves the data collected every 100 lines
if (line % 100) == 0:
print("Saving...")
# Saves to infoCSV.txt
with open('infoCSV.txt', 'a') as file:
# Convert each element of the list to a string and write it to the file
for item in info_csv:
file.write(str(item) + '\n')
if bad_info_csv:
with open('bad_infoCSV.txt', 'a') as file:
# Convert each element of the list to a string and write it to the file
for item in bad_info_csv:
file.write(str(item) + '\n')
# Erases the saved data so it isn't written again on the next save
info_csv = []
bad_info_csv = []
# Stops chromedriver
driver.quit()
return [info_csv, bad_info_csv]
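# Example usage (a sketch; the output shapes follow the row-building code above):
#   rows, failed = informationScraper(game_urls, start_line)
#   rows   -> "line,stadium,roof,surface,fgm,fga,temperature,humidity,wind,wind_chill" strings
#   failed -> "line,url" strings for requests that did not return HTTP 200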
def main():
"""
Start of the program, allows the initialization of all other functions. Also stores
information gathered.
"""
# Gives the choice of which list to use
print("Use 'bad_infoCSV.txt' instead of game_url_list.txt?")
user_input = input("Y/N: ")
game_urls = []
if user_input == "N":
# Reads game_url_list.txt, stores information in game_urls
with open('game_url_list.txt', 'r') as file:
# Read each line from the file and append it to the list
for line in file:
game_urls.append(line.strip())
# Asks for the line number to start scraping at
print("Which line are we starting at?")
line_number = int(input("#: "))
else:
# Reads bad_infoCSV.txt, stores information in game_urls
with open('bad_infoCSV.txt', 'r') as file:
# Each stored line is "line,url"; keep only the url portion for re-scraping
for line in file:
game_urls.append(line.strip().split(",", 1)[-1])
line_number = 0
# Initializes the webscraping and stores the result in 'information'
information = informationScraper(game_urls, line_number)
# Appends information[0] to infoCSV.txt
with open('infoCSV.txt', 'a') as file:
# Convert each element of the list to a string and write it to the file
for item in information[0]:
file.write(str(item) + '\n')
# Appends information[1] to bad_infoCSV.txt
with open('bad_infoCSV.txt', 'a') as file:
# Convert each element of the list to a string and write it to the file
for item in information[1]:
file.write(str(item) + '\n')
if __name__ == "__main__":
main()