Skip to content

Commit

Permalink
update code refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
MauroLuzzatto committed Jul 23, 2023
1 parent bdbb406 commit 4e284e7
Show file tree
Hide file tree
Showing 15 changed files with 170 additions and 57 deletions.
32 changes: 32 additions & 0 deletions music_flow/configs/model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Data parameters
data:
  dataset_path: 'data/processed/'

# Model parameters
model:
  name: 'singleoutput'
  num_units: 224
  num_layers: 5
  activation_function: 'sigmoid'

# Training parameters
training:
  batch_size: 128
  num_epochs: 200
  loss_function: 'mae'
  metric: 'mse'

# Logging and output parameters
mlflow:
  mlruns_path: 'file:models/mlruns'
  experiment_name: 'singleOutput'

# Tuning: each key lists the values to try for that parameter
hyperparameter_tuning:
  num_layers: [3, 5]
  num_units: [16, 64, 224]
  activation_function: ['relu', 'sigmoid']
  batch_size: [128, 256]
  loss_function: ['mae']
  metric: ['mse']
  num_epochs: [200]
10 changes: 5 additions & 5 deletions music_flow/core/batch_spotify_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ class BatchSpotifyAPI(SpotifyAPI):
def __init__(self):
super().__init__()

def get_multiple_audio_features(self, ids: list[str]):
def get_batch_audio_features(self, ids: list[str]):
if len(ids) > 100:
raise Exception("too many values requested")

Expand All @@ -14,7 +14,7 @@ def get_multiple_audio_features(self, ids: list[str]):
response, status_code = self.get_request(url)
return response, status_code

def get_multiple_tracks(self, ids: list[str]):
def get_batch_tracks(self, ids: list[str]):
if len(ids) > 50:
raise Exception("too many values requested")

Expand All @@ -24,7 +24,7 @@ def get_multiple_tracks(self, ids: list[str]):
return response, status_code

@staticmethod
def convert_multi_response_to_dict(responses):
def convert_batch_response_to_dict(responses):
try:
output_dict = {audio["id"]: audio for audio in responses}
except:
Expand All @@ -35,9 +35,9 @@ def convert_multi_response_to_dict(responses):
if __name__ == "__main__":
from pprint import pprint

multi = MultiSpotifyAPI()
multi = BatchSpotifyAPI()
ids = ["1qRpqv3I1t1kRol36KAfEi", "4knd2gQyr2DTRLfJDHcyMS"]
response, status_code = multi.get_multiple_audio_features(ids)
response, status_code = multi.get_batch_audio_features(ids)
pprint(response)

# get_multiple_tracks was renamed to get_batch_tracks on BatchSpotifyAPI;
# the old name no longer exists and would raise AttributeError.
response, status_code = multi.get_batch_tracks(ids)
Expand Down
3 changes: 1 addition & 2 deletions music_flow/core/features/get_raw_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
logger.setLevel(logging.INFO)

spotify_api = SpotifyAPI()
# TODO: allow for batch downloading of api requests


@dataclass
Expand Down Expand Up @@ -102,7 +101,7 @@ def get_raw_features(
for endpoint in endpoints:
name = endpoint.name
response, status_code = endpoint.func(track_id)
logger.info(f"endpoint: {endpoint.name}, status_code: {status_code}")
logger.debug(f"endpoint: {endpoint.name}, status_code: {status_code}")
if status_code == 200:
data[name] = response
else:
Expand Down
2 changes: 1 addition & 1 deletion music_flow/core/features/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ def feature_preprocessing(dataset: pd.DataFrame):

# transformation step
for column in [
"plays",
"speechiness",
"acousticness",
"instrumentalness",
"liveness",
"plays",
]:
dataset[column] = dataset[column].apply(np.log1p)

Expand Down
11 changes: 0 additions & 11 deletions music_flow/core/spotify_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,3 @@ def get_audio_analysis(self, id):
@staticmethod
def clean_string(string):
return requote_uri(string)

# (
# string.replace("'", "")
# .replace("-", " ")
# .replace("(", "")
# .replace(")", "")
# .replace("#", "")
# .replace("&", " ")
# .replace("'", "")
# .replace("5", "")
# )
2 changes: 1 addition & 1 deletion music_flow/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def read_json(path: str) -> dict:
path_results = os.path.join(path, "results")
path_registry = os.path.join(path, "registry")
path_reports = os.path.join(path, "reports")
path_data_lake = os.path.join(path, "data_lake")
path_data_lake = os.path.join(path, "data_lake_v2")
path_data_lake_success = os.path.join(path_data_lake, "success")
path_data_lake_failed = os.path.join(path_data_lake, "failed")

Expand Down
13 changes: 5 additions & 8 deletions music_flow/dataset/download_audio_features_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def download_audio_features_batch(is_retry_failed_files: bool = False) -> bool:

start_time = time.time()

multi = MultiSpotifyAPI()
batch = BatchSpotifyAPI()
data_collection = {}

for index, row in df.iterrows():
Expand Down Expand Up @@ -94,15 +94,12 @@ def download_audio_features_batch(is_retry_failed_files: bool = False) -> bool:
continue

ids = list(data_collection.keys())
response, status_code = multi.get_multiple_audio_features(ids)
audio_features_dict = multi.convert_multi_response_to_dict(
response, _ = batch.get_batch_audio_features(ids)
audio_features_dict = batch.convert_batch_response_to_dict(
response["audio_features"]
)
response, status_code = multi.get_multiple_tracks(ids)
tracks_dict = multi.convert_multi_response_to_dict(response["tracks"])

# if not tracks_dict or not audio_features_dict:
# continue
response, _ = batch.get_batch_tracks(ids)
tracks_dict = batch.convert_batch_response_to_dict(response["tracks"])

for track_id, data in data_collection.items():
try:
Expand Down
13 changes: 8 additions & 5 deletions music_flow/main_train.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os

import numpy as np
import pandas as pd
from xgboost import XGBRegressor # type: ignore

Expand Down Expand Up @@ -34,7 +34,6 @@


path_dataset_file = os.path.join(path_dataset, dataset_settings.FINAL_DATASET)

dataset = pd.read_csv(path_dataset_file, sep=";", index_col=0) # type: ignore
dataset = feature_preprocessing(dataset)

Expand Down Expand Up @@ -77,10 +76,12 @@
"Unknown",
]


# DataFrame.sample returns a new, shuffled frame and leaves the original
# untouched, so the result must be re-assigned for the shuffle to take effect.
dataset = dataset.sample(frac=1, random_state=42)
X: pd.DataFrame = dataset[columns_scope]
y: pd.Series = dataset[target_column]

# y = np.log1p(y)

print(X.describe().T)
print(type(X))

Expand All @@ -100,5 +101,7 @@

trainer.train(param_distributions, cv_settings)

registry = ModelRegistry(bucket_name=settings.BUCKET_NAME)
registry.upload_folder(trainer.folder_name)
upload = False
if upload:
registry = ModelRegistry(bucket_name=settings.BUCKET_NAME)
registry.upload_folder(trainer.folder_name)
4 changes: 3 additions & 1 deletion music_flow/model/Training.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(
self.X_test, self.y_test = dataset.get_test_data()
data_log = dataset.get_data_log()
self.column_names = dataset.get_column_names()
self.data_version = dataset.data_version

logger = Logger()
self.logger = logger(self.path_logs, stage="training")
Expand Down Expand Up @@ -240,7 +241,7 @@ def train_on_all_data(self) -> None:
"""
self.final_model = self.estimator.set_params(**self.best_params)
self.final_model.fit(self.X, self.y) # type: ignore
self.final_model.fit(self.dataset.X, self.dataset.y) # type: ignore

def save_metadata(self, score_dict) -> None:
"""
Expand Down Expand Up @@ -314,6 +315,7 @@ def train(self, param_distributions, cv_settings, config=None, save=False):
y_pred=self.y_pred_reversed,
)
score_dict = evaluator.evaluate()

evaluator.visualize(path_save=self.path_plots)

###### evaluate before training on all data
Expand Down
73 changes: 56 additions & 17 deletions music_flow/model/baseline.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import os

import numpy as np
import pandas as pd
from xgboost import XGBRegressor # type: ignore

from music_flow.__init__ import __version__ as model_version
from music_flow.config import settings
from music_flow.core.features.preprocessing import feature_preprocessing
from music_flow.core.model_registry import ModelRegistry
from music_flow.core.utils import path_dataset, path_results
from music_flow.core.utils import path_dataset, create_folder, path_results
from music_flow.config import dataset_settings
from music_flow.model.training import Training
from music_flow.model.training_data import TrainingData
from music_flow.model.evaluator import Evaluator
from music_flow.model.file_handler import save_json

path_dataset_file = os.path.join(path_dataset, dataset_settings.FINAL_DATASET)
dataset = pd.read_csv(path_dataset_file, sep=";", index_col=0) # type: ignore
Expand Down Expand Up @@ -64,27 +62,68 @@

dataset = TrainingData(X=X, y=y)
dataset.do_train_test_split()
data_log = dataset.get_data_log()
print(data_log)
X_test, y_test = dataset.get_test_data()
X_train, y_train = dataset.get_training_data()

estimator = XGBRegressor()

# Baseline - random values

folder_name = "Baseline0 - one value"
def baseline_model_constant_value(X_test: np.ndarray) -> np.ndarray:
    """Predict a constant value of zero for every test sample.

    Args:
        X_test: test feature matrix; only its first dimension (the number
            of samples) is used.

    Returns:
        1-D array of zeros with one entry per row of ``X_test``.
    """
    n_samples = X_test.shape[0]
    return np.zeros((n_samples,))


def baseline_model_simple_rule(
    X_test: np.ndarray,
    max_value: float = 30,
    popularity_scale: float = 100,
) -> np.ndarray:
    """Predict from the "popularity" feature alone.

    The popularity column is divided by ``popularity_scale`` and the result
    is multiplied by ``max_value`` to bring it into the prediction range.

    Args:
        X_test: 2-D test feature array whose columns follow the order of the
            module-level ``columns_scope`` list.
        max_value: upper end of the prediction range (defaults to 30).
        popularity_scale: divisor applied to the popularity column
            (defaults to 100).
            NOTE(review): presumably popularity ranges from 0 to 100, which
            would map it to 0..1 - confirm against the dataset.

    Returns:
        1-D array with one prediction per row of ``X_test``.

    Raises:
        ValueError: if "popularity" is not present in ``columns_scope``.
    """
    # Column position of the popularity feature within the feature matrix.
    index = columns_scope.index("popularity")
    scaled = X_test[:, index] / popularity_scale
    return scaled * max_value


def baseline_model_no_tuning(X_test: np.ndarray):
    """
    Fit the module-level estimator on the training split, without any
    hyperparameter search, and return its predictions for ``X_test``.
    """
    # The estimator is (re-)fitted on every call before predicting.
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return predictions

path_save = create_folder(os.path.join(path_model, folder_name))

# Baseline - only zeros
folder_name = "Baseline0 - zeros"
path_save = create_folder(os.path.join(path_results, folder_name))

evaluator = Evaluator(
y_test=self.y_test_reversed,
y_pred=self.y_pred_reversed,
)
y_pred = baseline_model_constant_value(X_test)
print(y_pred.shape, y_test.shape)
evaluator = Evaluator(y_test=y_test, y_pred=y_pred)
score_dict = evaluator.evaluate()
save_json(name="score_dict.json", data=score_dict, path=path_save)
evaluator.visualize(path_save=path_save)


# Baseline 1 - simpel rule
# Baseline 1 - simple rule
folder_name = "Baseline1 - simple rule"
path_save = create_folder(os.path.join(path_results, folder_name))

y_pred = baseline_model_simple_rule(X_test)
evaluator = Evaluator(y_test=y_test, y_pred=y_pred)
score_dict = evaluator.evaluate()
save_json(name="score_dict.json", data=score_dict, path=path_save)
evaluator.visualize(path_save=path_save)

# Baseline 2 - no hyperparameter tuning
folder_name = "Baseline2 - no hyperparamter tuning"
path_save = create_folder(os.path.join(path_results, folder_name))

# Baseline 2 - no-hyperparamter tuning
y_pred = baseline_model_no_tuning(X_test)
evaluator = Evaluator(y_test=y_test, y_pred=y_pred)
score_dict = evaluator.evaluate()
save_json(name="score_dict.json", data=score_dict, path=path_save)
evaluator.visualize(path_save=path_save)
43 changes: 43 additions & 0 deletions music_flow/model/compare_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from music_flow.core.utils import path_dataset, create_folder, path_results
import json
import os
import pandas as pd


results = []


def read_json(path):
    """Load a JSON file, falling back to an empty dict if it is missing.

    Args:
        path: location of the JSON file to read.

    Returns:
        The decoded JSON content, or ``{}`` when ``path`` does not exist or
        one of its parent components is not a directory.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return {}


# Collect one score dictionary per result folder found under path_results.
for folder in os.listdir(path_results):
    path = os.path.join(path_results, folder)
    print(path)
    if not os.path.isdir(path):
        continue

    files = os.listdir(path)

    if "score_dict.json" in files:
        # Runs that stored their scores directly as score_dict.json.
        score_dict = read_json(os.path.join(path, "score_dict.json"))
    elif "metadata.json" in files:
        # Runs that stored their scores inside metadata.json under
        # "model" -> "score" or "model" -> "score_dict".
        metadata = read_json(os.path.join(path, "metadata.json"))
        # read_json may return {}, so do not assume "model" is present.
        model_info = metadata.get("model", {})
        key = "score" if "score" in model_info else "score_dict"
        score_dict = model_info.get(key, {})
    else:
        # No score file in this folder: skip it instead of failing on an
        # unbound name or re-appending the previous folder's dictionary.
        continue

    score_dict["folder"] = folder
    results.append(score_dict)


# Build one table from all collected score dictionaries, ordered by the
# "mean_squared_error" column, and write it to results_overview.csv inside
# path_results before printing it.
# NOTE(review): sort_values raises KeyError if no collected entry has a
# "mean_squared_error" key (e.g. when results is empty) - confirm this is
# acceptable for the expected contents of path_results.
df = pd.DataFrame(results).sort_values("mean_squared_error")
df.to_csv(os.path.join(path_results, "results_overview.csv"))
print(df)
5 changes: 5 additions & 0 deletions music_flow/model/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def visualize(self, path_save: Optional[str] = None):
plt.xlabel("predictions")
plt.ylabel("test values")
plt.show(block=False)
plt.tight_layout()

if path_save:
fig.savefig(os.path.join(path_save, image_name))

Expand All @@ -76,6 +78,8 @@ def visualize(self, path_save: Optional[str] = None):
plt.ylabel("residuals")
plt.xlabel("predictions")
plt.show(block=False)
plt.tight_layout()

if path_save:
fig.savefig(os.path.join(path_save, image_name))

Expand All @@ -84,5 +88,6 @@ def visualize(self, path_save: Optional[str] = None):
plt.hist(residuals, alpha=0.5)
plt.ylabel("residuals")
plt.show(block=False)
plt.tight_layout()
if path_save:
fig.savefig(os.path.join(path_save, image_name))
Loading

0 comments on commit 4e284e7

Please sign in to comment.