diff --git a/pyproject.toml b/pyproject.toml index 304a581..c4a6c87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "CBBpy" -version = "1.0.3" +version = "1.0.4" description = 'A Python-based web scraper for NCAA basketball.' readme = "README.md" authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }] diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index ff42d34..93e46f1 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -25,11 +25,12 @@ _log = logging.getLogger(__name__) ATTEMPTS = 10 -DATE_PARSES = ['%Y-%m-%d', - '%Y/%m/%d', - '%m-%d-%Y', - '%m/%d/%Y', - ] +DATE_PARSES = [ + '%Y-%m-%d', + '%Y/%m/%d', + '%m-%d-%Y', + '%m/%d/%Y', +] USER_AGENTS = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', @@ -95,6 +96,25 @@ "TBD", "Suspended" ] +TOURN_WORDS = [ + 'tournament', + 'championship', + 'playoff', + '1st round', + '2nd round', + 'quarterfinal', + 'semifinal', + 'final' +] + +TOURN_SPEC = [ + 'cit ', + 'cbi ', + 'nit - ', + "men's basketball championship", + 'the basketball classic', + 'vegas 16', +] class CouldNotParseError(Exception): @@ -188,7 +208,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat([df_home, df_away]) + return pd.concat([df_home, df_away]).reset_index(drop=True) def get_game_pbp(game_id: str) -> pd.DataFrame: @@ -250,7 +270,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat(pbp_halves) + return pd.concat(pbp_halves).reset_index(drop=True) def get_game_info(game_id: str) -> pd.DataFrame: @@ -373,21 +393,26 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_network = np.nan game_arena_pre = game_info_div.find( - "div", {"class": "caption-wrapper"}) + 'div', {'class': 'caption-wrapper'}) if not game_arena_pre: div_loc = game_info_div.find( - "div", {"class": "location-details"}) - game_arena = div_loc.find( - "span", {"class": "game-location"}).get_text().strip() - game_loc = div_loc.find( - "div", {"class": "game-location"}).get_text().strip() + 'div', {'class': 'location-details'}) + game_arena = div_loc.find('span', {'class': 'game-location'}) + + if game_arena: + game_arena = game_arena.get_text().strip() + game_loc = div_loc.find( + 'div', {'class': 'game-location'}).get_text().strip() + + else: + game_arena = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() + game_loc = None else: game_arena = game_arena_pre.get_text().strip() - game_loc = ( - game_info_div.find( - "div", {"class": "game-location"}).get_text().strip() - ) + game_loc = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() game_cap_pre = game_info_div.find_all( "div", {"class": "game-info-note capacity"}) @@ -425,6 +450,28 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_r2 = np.nan game_r3 = np.nan + conf_home = 'conf' in home_div.get_text().lower() + conf_away = 'conf' in away_div.get_text().lower() + home_home = 'home' in home_div.get_text().lower() + away_away = 'away' in away_div.get_text().lower() + + if conf_home and conf_away: + is_conf = True + else: + is_conf = False + + if home_home or away_away: + is_neutral = False + elif is_conf and not type(game_meta) == str: + is_neutral = False + else: + is_neutral = True + + game_meta = str(game_meta) + + is_postseason = (any(x in game_meta.lower() for x in TOURN_WORDS) or + any(x in game_meta.lower() for x in TOURN_SPEC)) + # AGGREGATE DATA INTO DATAFRAME AND RETURN game_info_list = [ game_id, @@ -440,6 +487,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: away_score, home_win, num_ots, + is_conf, + is_neutral, + is_postseason, game_meta, game_day, game_time, @@ -467,6 +517,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: "away_score", "home_win", "num_ots", + "is_conference", + "is_neutral", + "is_postseason", "tournament", "game_day", "game_time", @@ -532,7 +585,7 @@ def get_games_season(season: int) -> tuple: all_data.append(games_info_day) else: - t.set_description(f"No games on {date}") + t.set_description(f"No games on {date.strftime('%D')}") date += timedelta(days=1) @@ -611,6 +664,9 @@ def get_game_ids(date: Union[str, datetime]) -> list: def _clean_boxscore_table(table, team, game_id): """A helper function to clean the DataFrame returned by get_game_boxscore""" + if len(table.find_all("thead")) <= 1: + return pd.DataFrame([]) + # GET RID OF UNWANTED ROWS all_rows = table.find_all("tr") bad_rows_a = table.find_all("thead")[1].find_all("tr") @@ -646,6 +702,32 @@ def _clean_boxscore_table(table, team, game_id): df.pf = pd.to_numeric(df.pf, errors='coerce') df.pts = pd.to_numeric(df.pts, errors='coerce') + # TOTALS ROW + tot_row = [row for row in all_rows if 'TEAM' in row.get_text()] + tot_t = "" + tot_t += str(tot_row[0]) + tot_t += "
" + + df_tot = pd.read_html(tot_t)[0] + + df_tot.columns = df.columns + # type handling + df_tot.starters = df_tot.starters.astype(str) + df_tot['min'] = pd.to_numeric(df_tot['min'], errors='coerce') + df_tot.fg = df_tot.fg.astype(str) + df_tot['3pt'] = df_tot['3pt'].astype(str) + df_tot.ft = df_tot.ft.astype(str) + df_tot.oreb = pd.to_numeric(df_tot.oreb, errors='coerce') + df_tot.dreb = pd.to_numeric(df_tot.dreb, errors='coerce') + df_tot.reb = pd.to_numeric(df_tot.reb, errors='coerce') + df_tot.ast = pd.to_numeric(df_tot.ast, errors='coerce') + df_tot.stl = pd.to_numeric(df_tot.stl, errors='coerce') + df_tot.blk = pd.to_numeric(df_tot.blk, errors='coerce') + df_tot.to = pd.to_numeric(df_tot.to, errors='coerce') + df_tot.pf = pd.to_numeric(df_tot.pf, errors='coerce') + df_tot.pts = pd.to_numeric(df_tot.pts, errors='coerce') + + # START BY CLEANING PLAYER BOXSCORES # GET PLAYER IDS ids = [x.find("a")["href"].split("/")[-2] for x in good_rows if x.find("a")] @@ -697,7 +779,68 @@ def _clean_boxscore_table(table, team, game_id): df.insert(14, "fta", fta) df['fta'] = pd.to_numeric(df['fta'], errors='coerce') - return df + # THEN CLEAN TOTAL ROW + # SPLIT UP THE FG FIELDS + fgm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['fg']] + fga = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['fg']] + thpm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['3pt']] + thpa = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['3pt']] + ftm = [x.split('-')[0] + if x.split('-')[0] != '' + else np.nan + for x in df_tot['ft']] + fta = [x.split('-')[1] + if x.split('-')[1] != '' + else np.nan + for x in df_tot['ft']] + + # GET RID OF UNWANTED COLUMNS + df_tot = df_tot.drop(columns=["fg", "3pt", "ft"]) + + df_tot = df_tot.rename(columns={"starters": "player"}) + df_tot['player'] = 'TEAM' + + # INSERT COLUMNS WHERE NECESSARY + df_tot.insert(0, "game_id", game_id) + df_tot.game_id = df_tot.game_id.astype(str) + df_tot.insert(1, "team", team) + df_tot.team = df_tot.team.astype(str) + df_tot.insert(3, "player_id", 'TOTAL') + df_tot.player_id = df_tot.player_id.astype(str) + df_tot.insert(4, "position", 'TOTAL') + df_tot.position = df_tot.position.astype(str) + df_tot.insert(5, "starter", 0) + df_tot.starter = df_tot.starter.astype(bool) + df_tot.insert(7, "fgm", fgm) + df_tot.fgm = pd.to_numeric(df_tot.fgm, errors='coerce') + df_tot.insert(8, "fga", fga) + df_tot.fga = pd.to_numeric(df_tot.fga, errors='coerce') + df_tot.insert(9, "2pm", [float(x) - float(y) for x, y in zip(fgm, thpm)]) + df_tot['2pm'] = pd.to_numeric(df_tot['2pm'], errors='coerce') + df_tot.insert(10, "2pa", [float(x) - float(y) for x, y in zip(fga, thpa)]) + df_tot['2pa'] = pd.to_numeric(df_tot['2pa'], errors='coerce') + df_tot.insert(11, "3pm", thpm) + df_tot['3pm'] = pd.to_numeric(df_tot['3pm'], errors='coerce') + df_tot.insert(12, "3pa", thpa) + df_tot['3pa'] = pd.to_numeric(df_tot['3pa'], errors='coerce') + df_tot.insert(13, "ftm", ftm) + df_tot['ftm'] = pd.to_numeric(df_tot['ftm'], errors='coerce') + df_tot.insert(14, "fta", fta) + df_tot['fta'] = pd.to_numeric(df_tot['fta'], errors='coerce') + + return pd.concat([df, df_tot]) def _get_pbp_map(soup):