-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.py
42 lines (32 loc) · 1.54 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import json
# Lower-cased column names of the music-genre ratings used as prediction targets.
MUSIC_CHOICES = [
    'classical music',
    'pop',
    'metal or hardrock',
    'hiphop, rap',
    'latino',
    'alternative',
]

# Location of the survey responses CSV, relative to the working directory.
DATA_FILE_PATH = './resources/responses.csv'

# A rating greater than or equal to this value is treated as "likes the genre".
THRESHOLD_TO_LIKE_A_GENRE = 4
def load_data(file_path, *, data_file_path=None, music_choices=None):
    """
    Loads X and y data from the responses CSV using predetermined features

    The CSV is read, missing values are replaced with 0 and all column names
    are lower-cased. The X columns are the ones listed under the ``features``
    key of the JSON file at ``file_path``; the y columns are the music genres.

    :param string file_path: path to a JSON file whose ``features`` entry lists
        the (lower-cased) column names to use as X data
    :param string data_file_path: optional path of the CSV to read; defaults to
        the module-level ``DATA_FILE_PATH``
    :param list music_choices: optional list of (lower-cased) y column names;
        defaults to the module-level ``MUSIC_CHOICES``
    :return: tuple(DataFrame, DataFrame)
    :raises KeyError: if the JSON file has no ``features`` entry or a requested
        column is not present in the CSV
    """
    if data_file_path is None:
        data_file_path = DATA_FILE_PATH
    if music_choices is None:
        music_choices = MUSIC_CHOICES

    raw_data = pd.read_csv(data_file_path).fillna(0)
    # column names are normalised to lower case, so the feature list and the
    # genre names must be given in lower case as well
    raw_data.columns = [col.lower() for col in raw_data.columns]

    # using the features found to be the best predictors during the EDA process
    # further work can be seen in the music_prediction_eda jupyter notebook
    with open(file_path, encoding='utf-8') as file:
        features = json.load(file)['features']

    return raw_data[features], raw_data[list(music_choices)]
def preprocess_data(question_data, music_data, *, threshold=None):
    """
    One Hot Encodes the X data and labels the y data

    The input frames are not modified; new frames are returned.

    :param DataFrame question_data: X data, must contain a ``gender`` column
    :param DataFrame music_data: y data holding numeric genre ratings
    :param int threshold: optional minimum rating that counts as liking a
        genre; defaults to the module-level ``THRESHOLD_TO_LIKE_A_GENRE``
    :return: tuple(DataFrame, DataFrame) - integer-encoded X data and boolean
        y data (True where the rating is at least ``threshold``)
    :raises KeyError: if ``question_data`` has no ``gender`` column
    """
    if threshold is None:
        threshold = THRESHOLD_TO_LIKE_A_GENRE

    # work on a copy: the frame handed in is typically a column selection of a
    # larger frame, and writing into it would alter the caller's data
    question_data = question_data.copy()

    # gender is binary so we convert that prior to OHE (One Hot Encoding);
    # replacing the whole column makes it an integer column, so get_dummies
    # leaves it alone. Anything other than 'female' becomes 0.
    question_data['gender'] = (question_data['gender'] == 'female').astype(int)

    question_data = pd.get_dummies(question_data, drop_first=True).astype(int)
    music_data = music_data >= threshold
    return question_data, music_data