Skip to content

Commit

Permalink
Merge pull request #18 from dcstats/1.0.4
Browse files Browse the repository at this point in the history
1.0.4
  • Loading branch information
dcstats authored Sep 29, 2022
2 parents 2b31406 + bdae9fe commit 73a511d
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 20 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "CBBpy"
version = "1.0.3"
version = "1.0.4"
description = 'A Python-based web scraper for NCAA basketball.'
readme = "README.md"
authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }]
Expand Down
181 changes: 162 additions & 19 deletions src/cbbpy/mens_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
_log = logging.getLogger(__name__)

ATTEMPTS = 10
DATE_PARSES = ['%Y-%m-%d',
'%Y/%m/%d',
'%m-%d-%Y',
'%m/%d/%Y',
]
DATE_PARSES = [
'%Y-%m-%d',
'%Y/%m/%d',
'%m-%d-%Y',
'%m/%d/%Y',
]
USER_AGENTS = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' +
'(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
Expand Down Expand Up @@ -95,6 +96,25 @@
"TBD",
"Suspended"
]
TOURN_WORDS = [
'tournament',
'championship',
'playoff',
'1st round',
'2nd round',
'quarterfinal',
'semifinal',
'final'
]

TOURN_SPEC = [
'cit ',
'cbi ',
'nit - ',
"men's basketball championship",
'the basketball classic',
'vegas 16',
]


class CouldNotParseError(Exception):
Expand Down Expand Up @@ -188,7 +208,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame:
# no exception thrown
break

return pd.concat([df_home, df_away])
return pd.concat([df_home, df_away]).reset_index(drop=True)


def get_game_pbp(game_id: str) -> pd.DataFrame:
Expand Down Expand Up @@ -250,7 +270,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame:
# no exception thrown
break

return pd.concat(pbp_halves)
return pd.concat(pbp_halves).reset_index(drop=True)


def get_game_info(game_id: str) -> pd.DataFrame:
Expand Down Expand Up @@ -373,21 +393,26 @@ def get_game_info(game_id: str) -> pd.DataFrame:
game_network = np.nan

game_arena_pre = game_info_div.find(
"div", {"class": "caption-wrapper"})
'div', {'class': 'caption-wrapper'})

if not game_arena_pre:
div_loc = game_info_div.find(
"div", {"class": "location-details"})
game_arena = div_loc.find(
"span", {"class": "game-location"}).get_text().strip()
game_loc = div_loc.find(
"div", {"class": "game-location"}).get_text().strip()
'div', {'class': 'location-details'})
game_arena = div_loc.find('span', {'class': 'game-location'})

if game_arena:
game_arena = game_arena.get_text().strip()
game_loc = div_loc.find(
'div', {'class': 'game-location'}).get_text().strip()

else:
game_arena = game_info_div.find(
'div', {'class': 'game-location'}).get_text().strip()
game_loc = None
else:
game_arena = game_arena_pre.get_text().strip()
game_loc = (
game_info_div.find(
"div", {"class": "game-location"}).get_text().strip()
)
game_loc = game_info_div.find(
'div', {'class': 'game-location'}).get_text().strip()

game_cap_pre = game_info_div.find_all(
"div", {"class": "game-info-note capacity"})
Expand Down Expand Up @@ -425,6 +450,28 @@ def get_game_info(game_id: str) -> pd.DataFrame:
game_r2 = np.nan
game_r3 = np.nan

conf_home = 'conf' in home_div.get_text().lower()
conf_away = 'conf' in away_div.get_text().lower()
home_home = 'home' in home_div.get_text().lower()
away_away = 'away' in away_div.get_text().lower()

if conf_home and conf_away:
is_conf = True
else:
is_conf = False

if home_home or away_away:
is_neutral = False
elif is_conf and not type(game_meta) == str:
is_neutral = False
else:
is_neutral = True

game_meta = str(game_meta)

is_postseason = (any(x in game_meta.lower() for x in TOURN_WORDS) or
any(x in game_meta.lower() for x in TOURN_SPEC))

# AGGREGATE DATA INTO DATAFRAME AND RETURN
game_info_list = [
game_id,
Expand All @@ -440,6 +487,9 @@ def get_game_info(game_id: str) -> pd.DataFrame:
away_score,
home_win,
num_ots,
is_conf,
is_neutral,
is_postseason,
game_meta,
game_day,
game_time,
Expand Down Expand Up @@ -467,6 +517,9 @@ def get_game_info(game_id: str) -> pd.DataFrame:
"away_score",
"home_win",
"num_ots",
"is_conference",
"is_neutral",
"is_postseason",
"tournament",
"game_day",
"game_time",
Expand Down Expand Up @@ -532,7 +585,7 @@ def get_games_season(season: int) -> tuple:
all_data.append(games_info_day)

else:
t.set_description(f"No games on {date}")
t.set_description(f"No games on {date.strftime('%D')}")

date += timedelta(days=1)

Expand Down Expand Up @@ -611,6 +664,9 @@ def get_game_ids(date: Union[str, datetime]) -> list:
def _clean_boxscore_table(table, team, game_id):
"""A helper function to clean the DataFrame returned by get_game_boxscore"""

if len(table.find_all("thead")) <= 1:
return pd.DataFrame([])

# GET RID OF UNWANTED ROWS
all_rows = table.find_all("tr")
bad_rows_a = table.find_all("thead")[1].find_all("tr")
Expand Down Expand Up @@ -646,6 +702,32 @@ def _clean_boxscore_table(table, team, game_id):
df.pf = pd.to_numeric(df.pf, errors='coerce')
df.pts = pd.to_numeric(df.pts, errors='coerce')

# TOTALS ROW
tot_row = [row for row in all_rows if 'TEAM' in row.get_text()]
tot_t = "<table>"
tot_t += str(tot_row[0])
tot_t += "</table>"

df_tot = pd.read_html(tot_t)[0]

df_tot.columns = df.columns
# type handling
df_tot.starters = df_tot.starters.astype(str)
df_tot['min'] = pd.to_numeric(df_tot['min'], errors='coerce')
df_tot.fg = df_tot.fg.astype(str)
df_tot['3pt'] = df_tot['3pt'].astype(str)
df_tot.ft = df_tot.ft.astype(str)
df_tot.oreb = pd.to_numeric(df_tot.oreb, errors='coerce')
df_tot.dreb = pd.to_numeric(df_tot.dreb, errors='coerce')
df_tot.reb = pd.to_numeric(df_tot.reb, errors='coerce')
df_tot.ast = pd.to_numeric(df_tot.ast, errors='coerce')
df_tot.stl = pd.to_numeric(df_tot.stl, errors='coerce')
df_tot.blk = pd.to_numeric(df_tot.blk, errors='coerce')
df_tot.to = pd.to_numeric(df_tot.to, errors='coerce')
df_tot.pf = pd.to_numeric(df_tot.pf, errors='coerce')
df_tot.pts = pd.to_numeric(df_tot.pts, errors='coerce')

# START BY CLEANING PLAYER BOXSCORES
# GET PLAYER IDS
ids = [x.find("a")["href"].split("/")[-2]
for x in good_rows if x.find("a")]
Expand Down Expand Up @@ -697,7 +779,68 @@ def _clean_boxscore_table(table, team, game_id):
df.insert(14, "fta", fta)
df['fta'] = pd.to_numeric(df['fta'], errors='coerce')

return df
# THEN CLEAN TOTAL ROW
# SPLIT UP THE FG FIELDS
fgm = [x.split('-')[0]
if x.split('-')[0] != ''
else np.nan
for x in df_tot['fg']]
fga = [x.split('-')[1]
if x.split('-')[1] != ''
else np.nan
for x in df_tot['fg']]
thpm = [x.split('-')[0]
if x.split('-')[0] != ''
else np.nan
for x in df_tot['3pt']]
thpa = [x.split('-')[1]
if x.split('-')[1] != ''
else np.nan
for x in df_tot['3pt']]
ftm = [x.split('-')[0]
if x.split('-')[0] != ''
else np.nan
for x in df_tot['ft']]
fta = [x.split('-')[1]
if x.split('-')[1] != ''
else np.nan
for x in df_tot['ft']]

# GET RID OF UNWANTED COLUMNS
df_tot = df_tot.drop(columns=["fg", "3pt", "ft"])

df_tot = df_tot.rename(columns={"starters": "player"})
df_tot['player'] = 'TEAM'

# INSERT COLUMNS WHERE NECESSARY
df_tot.insert(0, "game_id", game_id)
df_tot.game_id = df_tot.game_id.astype(str)
df_tot.insert(1, "team", team)
df_tot.team = df_tot.team.astype(str)
df_tot.insert(3, "player_id", 'TOTAL')
df_tot.player_id = df_tot.player_id.astype(str)
df_tot.insert(4, "position", 'TOTAL')
df_tot.position = df_tot.position.astype(str)
df_tot.insert(5, "starter", 0)
df_tot.starter = df_tot.starter.astype(bool)
df_tot.insert(7, "fgm", fgm)
df_tot.fgm = pd.to_numeric(df_tot.fgm, errors='coerce')
df_tot.insert(8, "fga", fga)
df_tot.fga = pd.to_numeric(df_tot.fga, errors='coerce')
df_tot.insert(9, "2pm", [float(x) - float(y) for x, y in zip(fgm, thpm)])
df_tot['2pm'] = pd.to_numeric(df_tot['2pm'], errors='coerce')
df_tot.insert(10, "2pa", [float(x) - float(y) for x, y in zip(fga, thpa)])
df_tot['2pa'] = pd.to_numeric(df_tot['2pa'], errors='coerce')
df_tot.insert(11, "3pm", thpm)
df_tot['3pm'] = pd.to_numeric(df_tot['3pm'], errors='coerce')
df_tot.insert(12, "3pa", thpa)
df_tot['3pa'] = pd.to_numeric(df_tot['3pa'], errors='coerce')
df_tot.insert(13, "ftm", ftm)
df_tot['ftm'] = pd.to_numeric(df_tot['ftm'], errors='coerce')
df_tot.insert(14, "fta", fta)
df_tot['fta'] = pd.to_numeric(df_tot['fta'], errors='coerce')

return pd.concat([df, df_tot])


def _get_pbp_map(soup):
Expand Down

0 comments on commit 73a511d

Please sign in to comment.