
Commit

PEP8 standard fixed by pycodestyle
snehakumari321 committed Mar 5, 2023
1 parent: a3633c9 · commit: 8997080
Showing 7 changed files with 56 additions and 28 deletions.
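pycodestyle is a checker, not an auto-formatter: it reports PEP 8 violations such as E501 (line longer than 79 characters), which is what every hunk below addresses, but it does not rewrite code itself. A minimal sketch of how the touched files could be re-checked after the edit, assuming pycodestyle is installed; the 79-character limit is the tool's default, not a setting taken from this repository:

# Minimal sketch: re-check the edited files with pycodestyle's Python API.
# Assumes pycodestyle is installed (pip install pycodestyle); the 79-character
# limit is the default, not a value stated anywhere in this commit.
import pycodestyle

style_guide = pycodestyle.StyleGuide(max_line_length=79)
report = style_guide.check_files([
    "components/get_data/run.py",
    "components/test_regression_model/run.py",
    "components/train_val_test_split/run.py",
    "main.py",
    "src/basic_cleaning/run.py",
    "src/train_random_forest/feature_engineering.py",
    "src/train_random_forest/run.py",
])
print(f"pycodestyle violations: {report.total_errors}")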
15 changes: 10 additions & 5 deletions components/get_data/run.py
@@ -30,16 +30,21 @@ def go(args):


if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Download URL to a local destination")
+ parser = argparse.\
+ ArgumentParser(description="Download URL to a local destination")

parser.add_argument("sample", type=str, help="Name of the sample to download")
parser.add_argument("sample", type=str,
help="Name of the sample to download")

parser.add_argument("artifact_name", type=str, help="Name for the output artifact")
parser.add_argument("artifact_name", type=str,
help="Name for the output artifact")

parser.add_argument("artifact_type", type=str, help="Output artifact type.")
parser.add_argument("artifact_type", type=str,
help="Output artifact type.")

parser.add_argument(
"artifact_description", type=str, help="A brief description of this artifact"
"artifact_description", type=str,
help="A brief description of this artifact"
)

args = parser.parse_args()
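The replacement wraps the constructor call with a backslash continuation (parser = argparse.\). That satisfies the length check, but PEP 8 itself prefers implicit continuation inside brackets when it is available. A sketch of that alternative, shown only for comparison and not part of this commit:

import argparse

# Sketch only, not what this commit applies: break inside the call's
# parentheses instead of relying on a backslash continuation.
parser = argparse.ArgumentParser(
    description="Download URL to a local destination"
)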
6 changes: 4 additions & 2 deletions components/test_regression_model/run.py
@@ -56,10 +56,12 @@ def go(args):
)

parser.add_argument(
"--mlflow_model", type=str, help="Input MLFlow model", required=True
"--mlflow_model", type=str,
help="Input MLFlow model", required=True
)

parser.add_argument("--test_dataset", type=str, help="Test dataset", required=True)
parser.add_argument("--test_dataset", type=str,
help="Test dataset", required=True)

args = parser.parse_args()

3 changes: 2 additions & 1 deletion components/train_val_test_split/run.py
@@ -56,7 +56,8 @@ def go(args):
parser.add_argument(
"test_size",
type=float,
help="Size of the test split. " "Fraction of the dataset, or number of items",
help="Size of the test split."
"Fraction of the dataset, or number of items",
)

parser.add_argument(
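The wrapped help text relies on implicit concatenation of adjacent string literals. Note that the replacement drops the space the original kept after "split. ", so the concatenated message now reads "...test split.Fraction of the dataset...". A sketch that keeps the separating space inside the first literal; parser is the ArgumentParser already defined in components/train_val_test_split/run.py, and this is an illustration, not what the commit contains:

# Sketch only: adjacent string literals are joined at compile time, so the
# separating space has to live inside one of the literals.
parser.add_argument(
    "test_size",
    type=float,
    help="Size of the test split. "
         "Fraction of the dataset, or number of items",
)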
21 changes: 14 additions & 7 deletions main.py
@@ -50,13 +50,15 @@ def go(config: DictConfig):

if "basic_cleaning" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
os.path.join(hydra.utils.get_original_cwd(),
"src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"output_description":
"Data with outliers and null values removed",
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
@@ -65,7 +67,8 @@ def go(config: DictConfig):
if "data_check" in active_steps:
if "data_check" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
os.path.join(hydra.utils.get_original_cwd(),
"src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
@@ -78,7 +81,8 @@ def go(config: DictConfig):

if "data_split" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/" f"train_val_test_split",
f"{config['main']['components_repository']}/"
f"train_val_test_split",
"main",
version="main",
parameters={
@@ -104,7 +108,8 @@ def go(config: DictConfig):

_ = mlflow.run(
os.path.join(
hydra.utils.get_original_cwd(), "src", "train_random_forest"
hydra.utils.get_original_cwd(), "src",
"train_random_forest"
),
"main",
parameters={
@@ -113,14 +118,16 @@ def go(config: DictConfig):
"random_seed": config["modeling"]["random_seed"],
"stratify": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"max_tfidf_features":
config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export",
},
)

if "test_regression_model" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/" f"test_regression_model",
f"{config['main']['components_repository']}/"
f"test_regression_model",
"main",
version="main",
parameters={
12 changes: 8 additions & 4 deletions src/basic_cleaning/run.py
@@ -33,7 +33,8 @@ def go(args):
df["last_review"] = pd.to_datetime(df["last_review"])

# Filter proper boundary
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].between(40.5, 41.2)
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].\
between(40.5, 41.2)
df = df[idx].copy()

df.to_csv(args.output_artifact, index=False)
@@ -68,7 +69,8 @@ def go(args):
required=True,
)

parser.add_argument("--output_type", type=str, help="type of output", required=True)
parser.add_argument("--output_type", type=str,
help="type of output", required=True)

parser.add_argument(
"--output_description",
@@ -78,11 +80,13 @@ def go(args):
)

parser.add_argument(
"--min_price", type=float, help="parameter for minimum price", required=True
"--min_price", type=float,
help="parameter for minimum price", required=True
)

parser.add_argument(
"--max_price", type=float, help="parameter for maximum price", required=True
"--max_price", type=float,
help="parameter for maximum price", required=True
)

args = parser.parse_args()
3 changes: 2 additions & 1 deletion src/train_random_forest/feature_engineering.py
@@ -9,4 +9,5 @@ def delta_date_feature(dates):
between each date and the most recent date in its column
"""
date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
- return date_sanitized.apply(lambda d: (d.max() - d).dt.days, axis=0).to_numpy()
+ return date_sanitized.apply(lambda d: (d.max() - d).dt.
+ days, axis=0).to_numpy()
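The new return statement breaks immediately after ".dt.". Because the break falls inside the apply(...) parentheses it still parses, but splitting an attribute access across lines is easy to misread. A sketch of an alternative that breaks right after the opening parenthesis instead, shown as an illustration and not as the committed code:

import pandas as pd


def delta_date_feature(dates):
    """Days between each date and the most recent date in its column."""
    # Sketch only: the continuation starts after "apply(", so no attribute
    # access is split across physical lines.
    date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
    return date_sanitized.apply(
        lambda d: (d.max() - d).dt.days, axis=0
    ).to_numpy()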
24 changes: 16 additions & 8 deletions src/train_random_forest/run.py
@@ -17,7 +17,8 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
+ from sklearn.preprocessing import OrdinalEncoder,\
+ OneHotEncoder, FunctionTransformer

import wandb
from sklearn.ensemble import RandomForestRegressor
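The import above is continued with a backslash. A widely used alternative, and the style PEP 8 points to, is to parenthesize the imported names so no backslash is needed; a sketch for comparison, not part of this commit:

# Sketch only: parentheses let the name list span lines without a backslash.
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    FunctionTransformer,
)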
@@ -32,7 +33,8 @@ def delta_date_feature(dates):
between each date and the most recent date in its column
"""
date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
- return date_sanitized.apply(lambda d: (d.max() - d).dt.days, axis=0).to_numpy()
+ return date_sanitized.apply(lambda d: (d.max() - d).dt.
+ days, axis=0).to_numpy()


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
@@ -107,7 +109,8 @@ def go(args):
# Save the sk_pipe pipeline as a mlflow.sklearn model in the
# directory "random_forest_dir"
# HINT: use mlflow.sklearn.save_model
- signature = mlflow.models.infer_signature(X_val[processed_features], y_pred)
+ signature = mlflow.models.\
+ infer_signature(X_val[processed_features], y_pred)

export_path = "random_forest_dir"
mlflow.sklearn.save_model(
@@ -158,16 +161,19 @@

def plot_feature_importance(pipe, feat_names):
# We collect the feature importance for all non-nlp features first
feat_imp = pipe["random_forest"].feature_importances_[: len(feat_names) - 1]
feat_imp = pipe["random_forest"].
feature_importances_[: len(feat_names) - 1]
# For the NLP feature we sum across all the TF-IDF dimensions into a global
# NLP importance
nlp_importance = sum(
pipe["random_forest"].feature_importances_[len(feat_names) - 1 :]
pipe["random_forest"].
feature_importances_[len(feat_names) - 1:]
)
feat_imp = np.append(feat_imp, nlp_importance)
fig_feat_imp, sub_feat_imp = plt.subplots(figsize=(10, 10))
# idx = np.argsort(feat_imp)[::-1]
- sub_feat_imp.bar(range(feat_imp.shape[0]), feat_imp, color="r", align="center")
+ sub_feat_imp.bar(range(feat_imp.shape[0]),
+ feat_imp, color="r", align="center")
_ = sub_feat_imp.set_xticks(range(feat_imp.shape[0]))
_ = sub_feat_imp.set_xticklabels(np.array(feat_names), rotation=90)
fig_feat_imp.tight_layout()
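One of these replacements deserves a second look. As shown here, the first wrapped assignment ends a physical line on a bare dot (feat_imp = pipe["random_forest"].) with no enclosing brackets and no backslash; outside brackets Python treats the newline as the end of the statement, so this line raises a SyntaxError. The similar break a few lines down parses only because it sits inside the sum(...) call. A sketch of a wrapping that parses, reusing the names from the surrounding function (pipe, feat_names, np); it is an illustration, not necessarily what the repository ended up with:

# Sketch only: bind the importances once, then slice; no line ends on a dot.
importances = pipe["random_forest"].feature_importances_
feat_imp = importances[: len(feat_names) - 1]
nlp_importance = importances[len(feat_names) - 1:].sum()
feat_imp = np.append(feat_imp, nlp_importance)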
Expand Down Expand Up @@ -217,7 +223,8 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# create a new feature from it,
date_imputer = make_pipeline(
SimpleImputer(strategy="constant", fill_value="2010-01-01"),
- FunctionTransformer(delta_date_feature, check_inverse=False, validate=False),
+ FunctionTransformer(delta_date_feature,
+ check_inverse=False, validate=False),
)

# Some minimal NLP for the "name" column
@@ -265,7 +272,8 @@ get_inference_pipeline(rf_config, max_tfidf_features):
# HINT: Use the explicit Pipeline constructor so you can assign
# the names to the steps, do not use make_pipeline
sk_pipe = Pipeline(
steps=[("preprocessor", preprocessor), ("random_forest", random_Forest)],
steps=[("preprocessor", preprocessor),
("random_forest", random_Forest)],
)

return sk_pipe, processed_features
