-
-
Notifications
You must be signed in to change notification settings - Fork 811
/
mergers.py
72 lines (59 loc) · 2.37 KB
/
mergers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
from os.path import dirname, join
import os
def import_merged_gw(season='2021-22'):
""" Function to call merged_gw.csv file in every data/season folder
Args:
season (str): Name of the folder season that contains the merged_gw.csv file
"""
path = os.getcwd()
filename = 'merged_gw.csv'
season_path = join(dirname(dirname("__file__")), path, 'data', season, 'gws', filename)
return season_path
def clean_players_name_string(df, col='name'):
""" Clean the imported file 'name' column because it has different patterns between seasons
Args:
df: merged df for all the seasons that have been imported
col: name of the column for cleanup
"""
#replace _ with space in name column
df[col] = df[col].str.replace('_', ' ')
#remove number in name column
df[col] = df[col].str.replace('\d+', '')
#trim name column
df[col] = df[col].str.strip()
return df
def filter_players_exist_latest(df, col='position'):
""" Fill in null 'position' (data that only available in 20-21 season) into previous seasons.
Null meaning that player doesnt exist in latest season hence can exclude.
"""
result = df.groupby('name')[col].apply(lambda x: x.ffill().bfill())
df[col] = result.droplevel(0)
df = df[df[col].notnull()]
return df
def get_opponent_team_name(df):
""" Find team name from master_team_list file and match with the merged df
"""
path = os.getcwd()
filename = 'master_team_list.csv'
team_path = join(dirname(dirname("__file__")), path, 'data', filename)
df_team = pd.read_csv(team_path)
#create id column for both df_team and df
df['id'] = df['season'].astype(str) + '_' + df['opponent_team'].astype(str)
df_team['id'] = df_team['season'].astype(str) + '_' + df_team['team'].astype(str)
#merge two dfs
df = pd.merge(df, df_team, on = 'id', how = 'left')
#rename column
df = df.rename(columns={"team_name": "opp_team_name"})
return df
def export_cleaned_data(df):
""" Function to export merged df into specified folder
Args:
path (str): Path of the folder
filename(str): Name of the file
"""
path = os.getcwd()
filename = 'cleaned_merged_seasons.csv'
filepath = join(dirname(dirname("__file__")), path, 'data', filename)
df.to_csv(filepath, encoding = 'utf-8', index=False)
return df