diff --git a/pyproject.toml b/pyproject.toml index 304a581..c4a6c87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "CBBpy" -version = "1.0.3" +version = "1.0.4" description = 'A Python-based web scraper for NCAA basketball.' readme = "README.md" authors = [{ name = "Daniel Cowan", email = "dnlcowan37@gmail.com" }] diff --git a/src/cbbpy/mens_scraper.py b/src/cbbpy/mens_scraper.py index ff42d34..93e46f1 100755 --- a/src/cbbpy/mens_scraper.py +++ b/src/cbbpy/mens_scraper.py @@ -25,11 +25,12 @@ _log = logging.getLogger(__name__) ATTEMPTS = 10 -DATE_PARSES = ['%Y-%m-%d', - '%Y/%m/%d', - '%m-%d-%Y', - '%m/%d/%Y', - ] +DATE_PARSES = [ + '%Y-%m-%d', + '%Y/%m/%d', + '%m-%d-%Y', + '%m/%d/%Y', +] USER_AGENTS = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', @@ -95,6 +96,25 @@ "TBD", "Suspended" ] +TOURN_WORDS = [ + 'tournament', + 'championship', + 'playoff', + '1st round', + '2nd round', + 'quarterfinal', + 'semifinal', + 'final' +] + +TOURN_SPEC = [ + 'cit ', + 'cbi ', + 'nit - ', + "men's basketball championship", + 'the basketball classic', + 'vegas 16', +] class CouldNotParseError(Exception): @@ -188,7 +208,7 @@ def get_game_boxscore(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat([df_home, df_away]) + return pd.concat([df_home, df_away]).reset_index(drop=True) def get_game_pbp(game_id: str) -> pd.DataFrame: @@ -250,7 +270,7 @@ def get_game_pbp(game_id: str) -> pd.DataFrame: # no exception thrown break - return pd.concat(pbp_halves) + return pd.concat(pbp_halves).reset_index(drop=True) def get_game_info(game_id: str) -> pd.DataFrame: @@ -373,21 +393,26 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_network = np.nan game_arena_pre = game_info_div.find( - "div", {"class": "caption-wrapper"}) + 'div', {'class': 'caption-wrapper'}) if not game_arena_pre: div_loc = game_info_div.find( - "div", {"class": "location-details"}) - game_arena = div_loc.find( - "span", {"class": "game-location"}).get_text().strip() - game_loc = div_loc.find( - "div", {"class": "game-location"}).get_text().strip() + 'div', {'class': 'location-details'}) + game_arena = div_loc.find('span', {'class': 'game-location'}) + + if game_arena: + game_arena = game_arena.get_text().strip() + game_loc = div_loc.find( + 'div', {'class': 'game-location'}).get_text().strip() + + else: + game_arena = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() + game_loc = None else: game_arena = game_arena_pre.get_text().strip() - game_loc = ( - game_info_div.find( - "div", {"class": "game-location"}).get_text().strip() - ) + game_loc = game_info_div.find( + 'div', {'class': 'game-location'}).get_text().strip() game_cap_pre = game_info_div.find_all( "div", {"class": "game-info-note capacity"}) @@ -425,6 +450,28 @@ def get_game_info(game_id: str) -> pd.DataFrame: game_r2 = np.nan game_r3 = np.nan + conf_home = 'conf' in home_div.get_text().lower() + conf_away = 'conf' in away_div.get_text().lower() + home_home = 'home' in home_div.get_text().lower() + away_away = 'away' in away_div.get_text().lower() + + if conf_home and conf_away: + is_conf = True + else: + is_conf = False + + if home_home or away_away: + is_neutral = False + elif is_conf and not type(game_meta) == str: + is_neutral = False + else: + is_neutral = True + + game_meta = str(game_meta) + + is_postseason = (any(x in game_meta.lower() for x in TOURN_WORDS) or + any(x in game_meta.lower() for x in TOURN_SPEC)) + # AGGREGATE DATA INTO DATAFRAME AND RETURN game_info_list = [ game_id, @@ -440,6 +487,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: away_score, home_win, num_ots, + is_conf, + is_neutral, + is_postseason, game_meta, game_day, game_time, @@ -467,6 +517,9 @@ def get_game_info(game_id: str) -> pd.DataFrame: "away_score", "home_win", "num_ots", + "is_conference", + "is_neutral", + "is_postseason", "tournament", "game_day", "game_time", @@ -532,7 +585,7 @@ def get_games_season(season: int) -> tuple: all_data.append(games_info_day) else: - t.set_description(f"No games on {date}") + t.set_description(f"No games on {date.strftime('%D')}") date += timedelta(days=1) @@ -611,6 +664,9 @@ def get_game_ids(date: Union[str, datetime]) -> list: def _clean_boxscore_table(table, team, game_id): """A helper function to clean the DataFrame returned by get_game_boxscore""" + if len(table.find_all("thead")) <= 1: + return pd.DataFrame([]) + # GET RID OF UNWANTED ROWS all_rows = table.find_all("tr") bad_rows_a = table.find_all("thead")[1].find_all("tr") @@ -646,6 +702,32 @@ def _clean_boxscore_table(table, team, game_id): df.pf = pd.to_numeric(df.pf, errors='coerce') df.pts = pd.to_numeric(df.pts, errors='coerce') + # TOTALS ROW + tot_row = [row for row in all_rows if 'TEAM' in row.get_text()] + tot_t = "