"""
STEP 1 - FootballInfoScaper.py
Takes a range of years and returns a txt file named "game_url_list.txt". This contains
a list, where each line is an element, of all regular NFL games played in that range.
These are stored as pro-football-reference.com urls.
Functions:
- generateYearUrls: Creates a list of years.
- generateWeekUrls: Creates a list of weeks.
- generateGameUrls: Creates a lost of games.
- main: Starts everything and stores information gathered.
"""
# Imports
import time
import requests
from bs4 import BeautifulSoup


def generateYearUrls(start: int, end: int):
    """
    Takes a range of years and returns a list of the pro-football-reference.com
    URLs that correspond to it.

    Parameters:
    - start (int): The year you want to start with.
    - end (int): The year you want to end with (inclusive).

    Returns:
    list: pro-football-reference.com year URLs.
    """
    url_list = []
    # Creates a URL for each year and stores it in url_list
    for year in range(start, end + 1):
        url = f"https://www.pro-football-reference.com/years/{year}/"
        url_list.append(url)
    return url_list
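
# Illustrative example (derived from the f-string above, not fetched):
# generateYearUrls(2000, 2002) returns
#   ["https://www.pro-football-reference.com/years/2000/",
#    "https://www.pro-football-reference.com/years/2001/",
#    "https://www.pro-football-reference.com/years/2002/"]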


def generateWeekUrls(year_url_list: list, start: int, end: int):
    """
    Takes a list of years in pro-football-reference.com URL form and returns a
    list of pro-football-reference.com URLs that correspond to each specified
    week within the years provided.

    Parameters:
    - year_url_list (list): List of pro-football-reference.com year URLs.
    - start (int): The week you want to start with.
    - end (int): The week you want to end with (inclusive).

    Returns:
    list: pro-football-reference.com week URLs.
    """
    url_list = []
    # Creates a URL for each week of each year and stores it in url_list
    for year in year_url_list:
        for week in range(start, end + 1):
            url = f"{year}week_{week}.htm"
            url_list.append(url)
    return url_list
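
# Illustrative example (derived from the f-string above, not fetched): with a
# single year URL, weeks 1-2 give
#   ["https://www.pro-football-reference.com/years/2000/week_1.htm",
#    "https://www.pro-football-reference.com/years/2000/week_2.htm"]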


def generateGameUrls(week_url_list: list):
    """
    Takes a list of weeks in pro-football-reference.com URL form and returns a
    list of pro-football-reference.com URLs that correspond to each game within
    the weeks provided.

    Parameters:
    - week_url_list (list): List of pro-football-reference.com week URLs.

    Returns:
    list:
    - [0] list: pro-football-reference.com game URLs.
    - [1] list: pro-football-reference.com week URLs that returned an error.
    """
    url_list, bad_url_list = [], []
    count = 1
    # Finds game URLs and stores them in url_list, storing week URLs that
    # returned an error in bad_url_list
    for url in week_url_list:
        # To keep pro-football-reference from temporarily IP banning us, delay
        # each request
        time.sleep(5)
        response = requests.get(url)
        # Checks for a temporary ban (assumes Retry-After is given in seconds)
        wait_time = response.headers.get('Retry-After')
        # If there is a ban, waits until it is over
        # (1 hour is typical from pro-football-reference)
        if wait_time:
            print(wait_time)
            time.sleep(int(wait_time) + 1)
            response = requests.get(url)
        # Prints the current count of which week you're on
        print(f"#{count}")
        count += 1
        # Checks whether the request succeeded
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all <a> tags with an href attribute starting with "/boxscores/20"
            target_links = soup.find_all(
                'a', href=lambda href: href and href.startswith("/boxscores/20"))
            # Extract the href attribute of each matching link and add it to the list
            for link in target_links:
                url_list.append("https://www.pro-football-reference.com"
                                + link.get('href'))
        else:
            # If the request was unsuccessful, add the week URL to bad_url_list
            print("Failed to retrieve the HTML content. "
                  f"Status code: {response.status_code}")
            bad_url_list.append(url)
    return [url_list, bad_url_list]
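
# Usage sketch (illustrative; performs real network requests if run):
#   games, failed_weeks = generateGameUrls(
#       ["https://www.pro-football-reference.com/years/2000/week_1.htm"])
# Every entry in games starts with
# "https://www.pro-football-reference.com/boxscores/20"; failed_weeks holds any
# week URLs worth retrying on a later run.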


def main():
    """
    Start of the program; drives the other functions and stores the information
    gathered.
    """
    # Gives the choice of reusing a previously created list or creating a new one
    print("Use 'bad_game_url_list.txt' as the weekUrls?")
    user_input = input("Y/N: ")
    # Normalizes the answer so "n" and "N" behave the same
    if user_input.strip().upper() == "N":
        # Generate the list of years, then the list of weeks from it
        year_urls = generateYearUrls(2000, 2022)
        week_urls = generateWeekUrls(year_urls, 1, 21)
        # For years that have an extra week
        week_urls.append("https://www.pro-football-reference.com/years/2021/week_22.htm")
        week_urls.append("https://www.pro-football-reference.com/years/2022/week_22.htm")
    else:
        week_urls = []
        # Reads bad_game_url_list.txt and stores its lines in week_urls
        with open('bad_game_url_list.txt', 'r') as file:
            # Read each line from the file and append it to the list
            for line in file:
                week_urls.append(line.strip())
    # Gets the final list of games
    game_urls = generateGameUrls(week_urls)
    # Appends game_urls[0] to game_url_list.txt
    with open('game_url_list.txt', 'a') as file:
        # Convert each element of the list to a string and write it to the file
        for item in game_urls[0]:
            file.write(str(item) + '\n')
    # Overwrites bad_game_url_list.txt with game_urls[1]
    with open('bad_game_url_list.txt', 'w') as file:
        # Convert each element of the list to a string and write it to the file
        for item in game_urls[1]:
            file.write(str(item) + '\n')
    # Creates the beginning of a file needed for FootballGameInfoScraper.py - STEP 2
    with open('infoCSV.txt', 'w') as file:
        file.write('"#","Stadium","Roof Type","Surface Type","Field Goals Made",'
                   '"Field Goals Attempted","Temperature","Relative Humidity",'
                   '"Wind Speed","Wind Chill"\n')
if __name__ == "__main__":
main()
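
# Usage (a sketch): run the script directly and answer the prompt. "N" builds the
# week list from scratch (2000-2022); any other answer retries the week URLs left
# in bad_game_url_list.txt by a previous run.
#   python FootballInfoScraper.py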