Skip to content

Commit

Permalink
PEP8 check, EDA markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
snehakumari321 committed Mar 5, 2023
1 parent b597bf0 commit 6a3068e
Show file tree
Hide file tree
Showing 14 changed files with 338 additions and 178 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@

#### My Submission:
GitHub repo: https://github.com/snehakumari321/build-ml-pipeline-for-short-term-rental-prices


WandB: https://wandb.ai/sneha_kumari/nyc_airbnb

# Build an ML Pipeline for Short-Term Rental Prices in NYC
You are working for a property management company renting rooms and properties for short periods of
time on various rental platforms. You need to estimate the typical price for a given property based
Expand Down
1 change: 0 additions & 1 deletion components/get_data/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


def go(args):

run = wandb.init(job_type="download_file")
run.config.update(args)

Expand Down
7 changes: 2 additions & 5 deletions components/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
name="wandb-utils",
version=0.1,
description="Utilities for interacting with Weights and Biases and mlflow",
zip_safe=False, # avoid eggs, which make the handling of package data cumbersome
zip_safe=False, # avoid eggs,make the handling of package data cumbersome
packages=["wandb_utils"],
classifiers=[
"Programming Language :: Python :: 3",
"Development Status :: 4 - Beta",
],
install_requires=[
"mlflow",
"wandb"
]
install_requires=["mlflow", "wandb"],
)
29 changes: 11 additions & 18 deletions components/test_regression_model/run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
"""
This step takes the best model, tagged with the "prod" tag, and tests it against the test dataset
This step takes the best model, tagged with the "prod" tag,
and tests it against the test dataset
"""
import argparse
import logging
Expand All @@ -17,13 +18,12 @@


def go(args):

run = wandb.init(job_type="test_model")
run.config.update(args)

logger.info("Downloading artifacts")
# Download input artifact. This will also log that this script is using this
# particular version of the artifact
# Download input artifact. This will also log that this script
# is using this particular version of the artifact
model_local_path = run.use_artifact(args.mlflow_model).download()

# Download test dataset
Expand All @@ -46,28 +46,21 @@ def go(args):
logger.info(f"MAE: {mae}")

# Log MAE and r2
run.summary['r2'] = r_squared
run.summary['mae'] = mae
run.summary["r2"] = r_squared
run.summary["mae"] = mae


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Test the provided model against the test dataset")

parser.add_argument(
"--mlflow_model",
type=str,
help="Input MLFlow model",
required=True
parser = argparse.ArgumentParser(
description="Test the provided model against the test dataset"
)

parser.add_argument(
"--test_dataset",
type=str,
help="Test dataset",
required=True
"--mlflow_model", type=str, help="Input MLFlow model", required=True
)

parser.add_argument("--test_dataset", type=str, help="Test dataset", required=True)

args = parser.parse_args()

go(args)
26 changes: 17 additions & 9 deletions components/train_val_test_split/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@


def go(args):

run = wandb.init(job_type="train_val_test_split")
run.config.update(args)

# Download input artifact. This will also note that this script is using this
# particular version of the artifact
# Download input artifact. This will also note that this script
# is using this particular version of the artifact
logger.info(f"Fetching artifact {args.input}")
artifact_local_path = run.use_artifact(args.input).file()

Expand All @@ -31,14 +30,13 @@ def go(args):
df,
test_size=args.test_size,
random_state=args.random_seed,
stratify=df[args.stratify_by] if args.stratify_by != 'none' else None,
stratify=df[args.stratify_by] if args.stratify_by != "none" else None,
)

# Save to output files
for df, k in zip([trainval, test], ['trainval', 'test']):
for df, k in zip([trainval, test], ["trainval", "test"]):
logger.info(f"Uploading {k}_data.csv dataset")
with tempfile.NamedTemporaryFile("w") as fp:

df.to_csv(fp.name, index=False)

log_artifact(
Expand All @@ -56,15 +54,25 @@ def go(args):
parser.add_argument("input", type=str, help="Input artifact to split")

parser.add_argument(
"test_size", type=float, help="Size of the test split. Fraction of the dataset, or number of items"
"test_size",
type=float,
help="Size of the test split. " "Fraction of the dataset, or number of items",
)

parser.add_argument(
"--random_seed", type=int, help="Seed for random number generator", default=42, required=False
"--random_seed",
type=int,
help="Seed for random number generator",
default=42,
required=False,
)

parser.add_argument(
"--stratify_by", type=str, help="Column to use for stratification", default='none', required=False
"--stratify_by",
type=str,
help="Column to use for stratification",
default="none",
required=False,
)

args = parser.parse_args()
Expand Down
11 changes: 7 additions & 4 deletions components/wandb_utils/log_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
import mlflow


def log_artifact(artifact_name, artifact_type, artifact_description, filename, wandb_run):
def log_artifact(
artifact_name, artifact_type, artifact_description, filename, wandb_run
):
"""
Log the provided filename as an artifact in W&B, and add the artifact path to the MLFlow run
so it can be retrieved by subsequent steps in a pipeline
Log the provided filename as an artifact in W&B, and add the artifact path
to the MLFlow run so it can be retrieved by subsequent steps in a pipeline
:param artifact_name: name for the artifact
:param artifact_type: type for the artifact (just a string like "raw_data", "clean_data" and so on)
:param artifact_type: type for the artifact
(just a string like "raw_data", "clean_data" and so on)
:param artifact_description: a brief description of the artifact
:param filename: local filename for the artifact
:param wandb_run: current Weights & Biases run
Expand Down
56 changes: 30 additions & 26 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,37 @@
"data_split",
"train_random_forest",
# NOTE: We do not include this in the steps so it is not run by mistake.
# You first need to promote a model export to "prod" before you can run this,
# You first need to promote a model export to "prod"
# before you can run this,
# then you need to run this step explicitly
# "test_regression_model"
# "test_regression_model"
]


# This automatically reads in the configuration
@hydra.main(config_name='config')
@hydra.main(config_name="config")
def go(config: DictConfig):

# Setup the wandb experiment. All runs will be grouped under this name
os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

# Steps to execute
steps_par = config['main']['steps']
steps_par = config["main"]["steps"]
active_steps = steps_par.split(",") if steps_par != "all" else _steps

# Move to a temporary directory
with tempfile.TemporaryDirectory() as tmp_dir:

if "download" in active_steps:
# Download file and load in W&B
_ = mlflow.run(
f"{config['main']['components_repository']}/get_data",
"main",
version='main',
version="main",
parameters={
"sample": config["etl"]["sample"],
"artifact_name": "sample.csv",
"artifact_type": "raw_data",
"artifact_description": "Raw file as downloaded"
"artifact_description": "Raw file as downloaded",
},
)

Expand All @@ -58,8 +57,8 @@ def go(config: DictConfig):
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
)

Expand All @@ -72,56 +71,61 @@ def go(config: DictConfig):
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
)

if "data_split" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
f"{config['main']['components_repository']}/" f"train_val_test_split",
"main",
version='main',
version="main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify": config['modeling']['stratify_by']
"stratify": config["modeling"]["stratify_by"],
},
)

if "train_random_forest" in active_steps:

# NOTE: we need to serialize the random forest configuration into JSON
# NOTE: we need to serialize the random forest
# configuration into JSON
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest"].items()), fp) # DO NOT TOUCH
json.dump(
dict(config["modeling"]["random_forest"].items()), fp
) # DO NOT TOUCH

# NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
# NOTE: use the rf_config we just created as the
# rf_config parameter for the train_random_forest
# step

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
os.path.join(
hydra.utils.get_original_cwd(), "src", "train_random_forest"
),
"main",
parameters={
"trainval_artifact": "clean_sample.csv:latest",
"val_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify": config['modeling']['stratify_by'],
"stratify": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config['modeling']['max_tfidf_features'],
"output_artifact": "random_forest_export"
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export",
},
)

if "test_regression_model" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
f"{config['main']['components_repository']}/" f"test_regression_model",
"main",
version='main',
version="main",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest"
"test_dataset": "test_data.csv:latest",
},
)

Expand Down
4 changes: 2 additions & 2 deletions src/basic_cleaning/MLproject
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ entry_points:

min_price:
description: parameter for minimum price
type: string
type: float

max_price:
description: parameter for maximum price
type: string
type: float


command: >-
Expand Down
Loading

0 comments on commit 6a3068e

Please sign in to comment.