Skip to content

Commit

Permalink
PEP8 check, EDA markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
snehakumari321 committed Mar 5, 2023
1 parent b597bf0 commit 6a3068e
Show file tree
Hide file tree
Showing 14 changed files with 338 additions and 178 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@

#### My Submission:
GitHub repo: https://github.com/snehakumari321/build-ml-pipeline-for-short-term-rental-prices


WandB: https://wandb.ai/sneha_kumari/nyc_airbnb

# Build an ML Pipeline for Short-Term Rental Prices in NYC
You are working for a property management company renting rooms and properties for short periods of
time on various rental platforms. You need to estimate the typical price for a given property based
Expand Down
1 change: 0 additions & 1 deletion components/get_data/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


def go(args):

run = wandb.init(job_type="download_file")
run.config.update(args)

Expand Down
7 changes: 2 additions & 5 deletions components/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
name="wandb-utils",
version=0.1,
description="Utilities for interacting with Weights and Biases and mlflow",
zip_safe=False, # avoid eggs, which make the handling of package data cumbersome
zip_safe=False, # avoid eggs,make the handling of package data cumbersome
packages=["wandb_utils"],
classifiers=[
"Programming Language :: Python :: 3",
"Development Status :: 4 - Beta",
],
install_requires=[
"mlflow",
"wandb"
]
install_requires=["mlflow", "wandb"],
)
29 changes: 11 additions & 18 deletions components/test_regression_model/run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
"""
This step takes the best model, tagged with the "prod" tag, and tests it against the test dataset
This step takes the best model, tagged with the "prod" tag,
and tests it against the test dataset
"""
import argparse
import logging
Expand All @@ -17,13 +18,12 @@


def go(args):

run = wandb.init(job_type="test_model")
run.config.update(args)

logger.info("Downloading artifacts")
# Download input artifact. This will also log that this script is using this
# particular version of the artifact
# Download input artifact. This will also log that this script
# is using this particular version of the artifact
model_local_path = run.use_artifact(args.mlflow_model).download()

# Download test dataset
Expand All @@ -46,28 +46,21 @@ def go(args):
logger.info(f"MAE: {mae}")

# Log MAE and r2
run.summary['r2'] = r_squared
run.summary['mae'] = mae
run.summary["r2"] = r_squared
run.summary["mae"] = mae


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Test the provided model against the test dataset")

parser.add_argument(
"--mlflow_model",
type=str,
help="Input MLFlow model",
required=True
parser = argparse.ArgumentParser(
description="Test the provided model against the test dataset"
)

parser.add_argument(
"--test_dataset",
type=str,
help="Test dataset",
required=True
"--mlflow_model", type=str, help="Input MLFlow model", required=True
)

parser.add_argument("--test_dataset", type=str, help="Test dataset", required=True)

args = parser.parse_args()

go(args)
26 changes: 17 additions & 9 deletions components/train_val_test_split/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@


def go(args):

run = wandb.init(job_type="train_val_test_split")
run.config.update(args)

# Download input artifact. This will also note that this script is using this
# particular version of the artifact
# Download input artifact. This will also note that this script
# is using this particular version of the artifact
logger.info(f"Fetching artifact {args.input}")
artifact_local_path = run.use_artifact(args.input).file()

Expand All @@ -31,14 +30,13 @@ def go(args):
df,
test_size=args.test_size,
random_state=args.random_seed,
stratify=df[args.stratify_by] if args.stratify_by != 'none' else None,
stratify=df[args.stratify_by] if args.stratify_by != "none" else None,
)

# Save to output files
for df, k in zip([trainval, test], ['trainval', 'test']):
for df, k in zip([trainval, test], ["trainval", "test"]):
logger.info(f"Uploading {k}_data.csv dataset")
with tempfile.NamedTemporaryFile("w") as fp:

df.to_csv(fp.name, index=False)

log_artifact(
Expand All @@ -56,15 +54,25 @@ def go(args):
parser.add_argument("input", type=str, help="Input artifact to split")

parser.add_argument(
"test_size", type=float, help="Size of the test split. Fraction of the dataset, or number of items"
"test_size",
type=float,
help="Size of the test split. " "Fraction of the dataset, or number of items",
)

parser.add_argument(
"--random_seed", type=int, help="Seed for random number generator", default=42, required=False
"--random_seed",
type=int,
help="Seed for random number generator",
default=42,
required=False,
)

parser.add_argument(
"--stratify_by", type=str, help="Column to use for stratification", default='none', required=False
"--stratify_by",
type=str,
help="Column to use for stratification",
default="none",
required=False,
)

args = parser.parse_args()
Expand Down
11 changes: 7 additions & 4 deletions components/wandb_utils/log_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
import mlflow


def log_artifact(artifact_name, artifact_type, artifact_description, filename, wandb_run):
def log_artifact(
artifact_name, artifact_type, artifact_description, filename, wandb_run
):
"""
Log the provided filename as an artifact in W&B, and add the artifact path to the MLFlow run
so it can be retrieved by subsequent steps in a pipeline
Log the provided filename as an artifact in W&B, and add the artifact path
to the MLFlow run so it can be retrieved by subsequent steps in a pipeline
:param artifact_name: name for the artifact
:param artifact_type: type for the artifact (just a string like "raw_data", "clean_data" and so on)
:param artifact_type: type for the artifact
(just a string like "raw_data", "clean_data" and so on)
:param artifact_description: a brief description of the artifact
:param filename: local filename for the artifact
:param wandb_run: current Weights & Biases run
Expand Down
56 changes: 30 additions & 26 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,37 @@
"data_split",
"train_random_forest",
# NOTE: We do not include this in the steps so it is not run by mistake.
# You first need to promote a model export to "prod" before you can run this,
# You first need to promote a model export to "prod"
# before you can run this,
# then you need to run this step explicitly
# "test_regression_model"
# "test_regression_model"
]


# This automatically reads in the configuration
@hydra.main(config_name='config')
@hydra.main(config_name="config")
def go(config: DictConfig):

# Setup the wandb experiment. All runs will be grouped under this name
os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

# Steps to execute
steps_par = config['main']['steps']
steps_par = config["main"]["steps"]
active_steps = steps_par.split(",") if steps_par != "all" else _steps

# Move to a temporary directory
with tempfile.TemporaryDirectory() as tmp_dir:

if "download" in active_steps:
# Download file and load in W&B
_ = mlflow.run(
f"{config['main']['components_repository']}/get_data",
"main",
version='main',
version="main",
parameters={
"sample": config["etl"]["sample"],
"artifact_name": "sample.csv",
"artifact_type": "raw_data",
"artifact_description": "Raw file as downloaded"
"artifact_description": "Raw file as downloaded",
},
)

Expand All @@ -58,8 +57,8 @@ def go(config: DictConfig):
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
)

Expand All @@ -72,56 +71,61 @@ def go(config: DictConfig):
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
)

if "data_split" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
f"{config['main']['components_repository']}/" f"train_val_test_split",
"main",
version='main',
version="main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify": config['modeling']['stratify_by']
"stratify": config["modeling"]["stratify_by"],
},
)

if "train_random_forest" in active_steps:

# NOTE: we need to serialize the random forest configuration into JSON
# NOTE: we need to serialize the random forest
# configuration into JSON
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest"].items()), fp) # DO NOT TOUCH
json.dump(
dict(config["modeling"]["random_forest"].items()), fp
) # DO NOT TOUCH

# NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
# NOTE: use the rf_config we just created as the
# rf_config parameter for the train_random_forest
# step

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
os.path.join(
hydra.utils.get_original_cwd(), "src", "train_random_forest"
),
"main",
parameters={
"trainval_artifact": "clean_sample.csv:latest",
"val_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify": config['modeling']['stratify_by'],
"stratify": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config['modeling']['max_tfidf_features'],
"output_artifact": "random_forest_export"
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export",
},
)

if "test_regression_model" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
f"{config['main']['components_repository']}/" f"test_regression_model",
"main",
version='main',
version="main",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest"
"test_dataset": "test_data.csv:latest",
},
)

Expand Down
4 changes: 2 additions & 2 deletions src/basic_cleaning/MLproject
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ entry_points:

min_price:
description: parameter for minimum price
type: string
type: float

max_price:
description: parameter for maximum price
type: string
type: float


command: >-
Expand Down
Loading

0 comments on commit 6a3068e

Please sign in to comment.