
Commit

PEP8 standard fixed by pycodestyle
snehakumari321 committed Mar 5, 2023
1 parent: a3633c9 · commit: 8997080
Showing 7 changed files with 56 additions and 28 deletions.
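pycodestyle is a checker, not an auto-formatter: it reports PEP 8 violations such as E501 (line longer than 79 characters), which is what every hunk below addresses, but it does not rewrite code itself. A minimal sketch of how the touched files could be re-checked after the edit, assuming pycodestyle is installed; the 79-character limit is the tool's default, not a setting taken from this repository:

# Minimal sketch: re-check the edited files with pycodestyle's Python API.
# Assumes pycodestyle is installed (pip install pycodestyle); the 79-character
# limit is the default, not a value stated anywhere in this commit.
import pycodestyle

style_guide = pycodestyle.StyleGuide(max_line_length=79)
report = style_guide.check_files([
    "components/get_data/run.py",
    "components/test_regression_model/run.py",
    "components/train_val_test_split/run.py",
    "main.py",
    "src/basic_cleaning/run.py",
    "src/train_random_forest/feature_engineering.py",
    "src/train_random_forest/run.py",
])
print(f"pycodestyle violations: {report.total_errors}")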
15 changes: 10 additions & 5 deletions components/get_data/run.py
@@ -30,16 +30,21 @@ def go(args):


if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Download URL to a local destination")
+ parser = argparse.\
+ ArgumentParser(description="Download URL to a local destination")

parser.add_argument("sample", type=str, help="Name of the sample to download")
parser.add_argument("sample", type=str,
help="Name of the sample to download")

parser.add_argument("artifact_name", type=str, help="Name for the output artifact")
parser.add_argument("artifact_name", type=str,
help="Name for the output artifact")

parser.add_argument("artifact_type", type=str, help="Output artifact type.")
parser.add_argument("artifact_type", type=str,
help="Output artifact type.")

parser.add_argument(
"artifact_description", type=str, help="A brief description of this artifact"
"artifact_description", type=str,
help="A brief description of this artifact"
)

args = parser.parse_args()
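The replacement wraps the constructor call with a backslash continuation (parser = argparse.\). That satisfies the length check, but PEP 8 itself prefers implicit continuation inside brackets when it is available. A sketch of that alternative, shown only for comparison and not part of this commit:

import argparse

# Sketch only, not what this commit applies: break inside the call's
# parentheses instead of relying on a backslash continuation.
parser = argparse.ArgumentParser(
    description="Download URL to a local destination"
)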
6 changes: 4 additions & 2 deletions components/test_regression_model/run.py
@@ -56,10 +56,12 @@ def go(args):
)

parser.add_argument(
"--mlflow_model", type=str, help="Input MLFlow model", required=True
"--mlflow_model", type=str,
help="Input MLFlow model", required=True
)

parser.add_argument("--test_dataset", type=str, help="Test dataset", required=True)
parser.add_argument("--test_dataset", type=str,
help="Test dataset", required=True)

args = parser.parse_args()

3 changes: 2 additions & 1 deletion components/train_val_test_split/run.py
@@ -56,7 +56,8 @@ def go(args):
parser.add_argument(
"test_size",
type=float,
help="Size of the test split. " "Fraction of the dataset, or number of items",
help="Size of the test split."
"Fraction of the dataset, or number of items",
)

parser.add_argument(
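The wrapped help text relies on implicit concatenation of adjacent string literals. Note that the replacement drops the space the original kept after "split. ", so the concatenated message now reads "...test split.Fraction of the dataset...". A sketch that keeps the separating space inside the first literal; parser is the ArgumentParser already defined in components/train_val_test_split/run.py, and this is an illustration, not what the commit contains:

# Sketch only: adjacent string literals are joined at compile time, so the
# separating space has to live inside one of the literals.
parser.add_argument(
    "test_size",
    type=float,
    help="Size of the test split. "
         "Fraction of the dataset, or number of items",
)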
21 changes: 14 additions & 7 deletions main.py
@@ -50,13 +50,15 @@ def go(config: DictConfig):

if "basic_cleaning" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
os.path.join(hydra.utils.get_original_cwd(),
"src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"output_description":
"Data with outliers and null values removed",
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
@@ -65,7 +67,8 @@ def go(config: DictConfig):
if "data_check" in active_steps:
if "data_check" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
os.path.join(hydra.utils.get_original_cwd(),
"src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
@@ -78,7 +81,8 @@ def go(config: DictConfig):

if "data_split" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/" f"train_val_test_split",
f"{config['main']['components_repository']}/"
f"train_val_test_split",
"main",
version="main",
parameters={
@@ -104,7 +108,8 @@ def go(config: DictConfig):

_ = mlflow.run(
os.path.join(
hydra.utils.get_original_cwd(), "src", "train_random_forest"
hydra.utils.get_original_cwd(), "src",
"train_random_forest"
),
"main",
parameters={
@@ -113,14 +118,16 @@ def go(config: DictConfig):
"random_seed": config["modeling"]["random_seed"],
"stratify": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"max_tfidf_features":
config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export",
},
)

if "test_regression_model" in active_steps:
_ = mlflow.run(
f"{config['main']['components_repository']}/" f"test_regression_model",
f"{config['main']['components_repository']}/"
f"test_regression_model",
"main",
version="main",
parameters={
12 changes: 8 additions & 4 deletions src/basic_cleaning/run.py
@@ -33,7 +33,8 @@ def go(args):
df["last_review"] = pd.to_datetime(df["last_review"])

# Filter proper boundary
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].between(40.5, 41.2)
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].\
between(40.5, 41.2)
df = df[idx].copy()

df.to_csv(args.output_artifact, index=False)
@@ -68,7 +69,8 @@ def go(args):
required=True,
)

parser.add_argument("--output_type", type=str, help="type of output", required=True)
parser.add_argument("--output_type", type=str,
help="type of output", required=True)

parser.add_argument(
"--output_description",
@@ -78,11 +80,13 @@ def go(args):
)

parser.add_argument(
"--min_price", type=float, help="parameter for minimum price", required=True
"--min_price", type=float,
help="parameter for minimum price", required=True
)

parser.add_argument(
"--max_price", type=float, help="parameter for maximum price", required=True
"--max_price", type=float,
help="parameter for maximum price", required=True
)

args = parser.parse_args()
3 changes: 2 additions & 1 deletion src/train_random_forest/feature_engineering.py
@@ -9,4 +9,5 @@ def delta_date_feature(dates):
between each date and the most recent date in its column
"""
date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
- return date_sanitized.apply(lambda d: (d.max() - d).dt.days, axis=0).to_numpy()
+ return date_sanitized.apply(lambda d: (d.max() - d).dt.
+ days, axis=0).to_numpy()
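The new return statement breaks immediately after ".dt.". Because the break falls inside the apply(...) parentheses it still parses, but splitting an attribute access across lines is easy to misread. A sketch of an alternative that breaks right after the opening parenthesis instead, shown as an illustration and not as the committed code:

import pandas as pd


def delta_date_feature(dates):
    """Days between each date and the most recent date in its column."""
    # Sketch only: the continuation starts after "apply(", so no attribute
    # access is split across physical lines.
    date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
    return date_sanitized.apply(
        lambda d: (d.max() - d).dt.days, axis=0
    ).to_numpy()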
24 changes: 16 additions & 8 deletions src/train_random_forest/run.py
@@ -17,7 +17,8 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
+ from sklearn.preprocessing import OrdinalEncoder,\
+ OneHotEncoder, FunctionTransformer

import wandb
from sklearn.ensemble import RandomForestRegressor
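The import above is continued with a backslash. A widely used alternative, and the style PEP 8 points to, is to parenthesize the imported names so no backslash is needed; a sketch for comparison, not part of this commit:

# Sketch only: parentheses let the name list span lines without a backslash.
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    FunctionTransformer,
)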
@@ -32,7 +33,8 @@ def delta_date_feature(dates):
between each date and the most recent date in its column
"""
date_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
- return date_sanitized.apply(lambda d: (d.max() - d).dt.days, axis=0).to_numpy()
+ return date_sanitized.apply(lambda d: (d.max() - d).dt.
+ days, axis=0).to_numpy()


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
@@ -107,7 +109,8 @@ def go(args):
# Save the sk_pipe pipeline as a mlflow.sklearn model in the
# directory "random_forest_dir"
# HINT: use mlflow.sklearn.save_model
- signature = mlflow.models.infer_signature(X_val[processed_features], y_pred)
+ signature = mlflow.models.\
+ infer_signature(X_val[processed_features], y_pred)

export_path = "random_forest_dir"
mlflow.sklearn.save_model(
@@ -158,16 +161,19 @@

def plot_feature_importance(pipe, feat_names):
# We collect the feature importance for all non-nlp features first
feat_imp = pipe["random_forest"].feature_importances_[: len(feat_names) - 1]
feat_imp = pipe["random_forest"].
feature_importances_[: len(feat_names) - 1]
# For the NLP feature we sum across all the TF-IDF dimensions into a global
# NLP importance
nlp_importance = sum(
pipe["random_forest"].feature_importances_[len(feat_names) - 1 :]
pipe["random_forest"].
feature_importances_[len(feat_names) - 1:]
)
feat_imp = np.append(feat_imp, nlp_importance)
fig_feat_imp, sub_feat_imp = plt.subplots(figsize=(10, 10))
# idx = np.argsort(feat_imp)[::-1]
- sub_feat_imp.bar(range(feat_imp.shape[0]), feat_imp, color="r", align="center")
+ sub_feat_imp.bar(range(feat_imp.shape[0]),
+ feat_imp, color="r", align="center")
_ = sub_feat_imp.set_xticks(range(feat_imp.shape[0]))
_ = sub_feat_imp.set_xticklabels(np.array(feat_names), rotation=90)
fig_feat_imp.tight_layout()
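One of these replacements deserves a second look. As shown here, the first wrapped assignment ends a physical line on a bare dot (feat_imp = pipe["random_forest"].) with no enclosing brackets and no backslash; outside brackets Python treats the newline as the end of the statement, so this line raises a SyntaxError. The similar break a few lines down parses only because it sits inside the sum(...) call. A sketch of a wrapping that parses, reusing the names from the surrounding function (pipe, feat_names, np); it is an illustration, not necessarily what the repository ended up with:

# Sketch only: bind the importances once, then slice; no line ends on a dot.
importances = pipe["random_forest"].feature_importances_
feat_imp = importances[: len(feat_names) - 1]
nlp_importance = importances[len(feat_names) - 1:].sum()
feat_imp = np.append(feat_imp, nlp_importance)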
Expand Down Expand Up @@ -217,7 +223,8 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# create a new feature from it,
date_imputer = make_pipeline(
SimpleImputer(strategy="constant", fill_value="2010-01-01"),
- FunctionTransformer(delta_date_feature, check_inverse=False, validate=False),
+ FunctionTransformer(delta_date_feature,
+ check_inverse=False, validate=False),
)

# Some minimal NLP for the "name" column
@@ -265,7 +272,8 @@ get_inference_pipeline(rf_config, max_tfidf_features):
# HINT: Use the explicit Pipeline constructor so you can assign
# the names to the steps, do not use make_pipeline
sk_pipe = Pipeline(
steps=[("preprocessor", preprocessor), ("random_forest", random_Forest)],
steps=[("preprocessor", preprocessor),
("random_forest", random_Forest)],
)

return sk_pipe, processed_features
