Removed src dir

eyra · Mar 3, 2024 · de7cd18 · de7cd18
1 parent e861ef2
commit de7cd18
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 28 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -3,11 +3,11 @@ FROM continuumio/anaconda3:2023.03-1
 COPY environment.yml /
 RUN conda env create -f /environment.yml
 
-RUN mkdir /src
+RUN mkdir /app
 
 COPY data /data
-COPY src/script.py /src
+COPY script.py /
 COPY models /models
 
-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/src/script.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/script.py"]
 CMD ["predict", "/data/fake_data.csv"]
diff --git a/src/Example pipeline.ipynb → Example pipeline.ipynb b/src/Example pipeline.ipynb → Example pipeline.ipynb
@@ -236,7 +236,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "models_path = os.path.join(\"..\", \"models\")\n",
+    "models_path = \"models\"\n",
     "os.makedirs(models_path, exist_ok=True)\n",
     "\n",
     "# Dump model (don't change the name)\n",
@@ -249,9 +249,9 @@
    "metadata": {},
    "source": [
     "# What the submission would look like\n",
-    "The snippet below is taken from the file `src/script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
+    "The snippet below is taken from the file `script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
     "\n",
-    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `src/script.py` script (**the code below is just an excerpt**).\n",
+    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the preprocessing, they should also be copied to `script.py` (**the code below is just an excerpt**).\n",
     "\n",
     "Finally, the script loads the model that was saved in the step above and does the prediction."
    ]
@@ -271,7 +271,7 @@
     "    df = df.loc[:, keepcols]\n",
     "    \n",
     "    # Load your trained model from the models directory\n",
-    "    model_path = os.path.join(os.path.dirname(__file__), \"..\", \"models\", \"model.joblib\")\n",
+    "    model_path = os.path.join(os.path.dirname(__file__), \"models\", \"model.joblib\")\n",
     "    model = load(model_path)\n",
     "\n",
     "    # Use your trained model for prediction\n",

diff --git a/src/script.py → script.py b/src/script.py → script.py
@@ -29,14 +29,18 @@
 subparsers = parser.add_subparsers(dest="command")
 
 # Process subcommand
-process_parser = subparsers.add_parser("predict", help="Process input data for prediction.")
+process_parser = subparsers.add_parser(
+    "predict", help="Process input data for prediction."
+)
 process_parser.add_argument("input_path", help="Path to input data CSV file.")
 process_parser.add_argument("--output", help="Path to prediction output CSV file.")
 
 # Score subcommand
 score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
 score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
-score_parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
+score_parser.add_argument(
+    "ground_truth_path", help="Path to ground truth outcome CSV file."
+)
 score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
 
 args = parser.parse_args()
@@ -54,14 +58,20 @@ def predict_outcomes(df):
     # individual did not have a child during 2020-2022, while '1' implies that
     # they did.
 
-    # Keep 
-    keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']
+    # Keep only the columns listed in keepcols
+    keepcols = [
+        "burgstat2019",
+        "leeftijd2019",
+        "woonvorm2019",
+        "oplmet2019",
+        "aantalki2019",
+    ]
     nomem_encr = df["nomem_encr"]
-    
+
     df = df.loc[:, keepcols]
-    
+
     # Load your trained model from the models directory
-    model_path = os.path.join(os.path.dirname(__file__), "..", "models", "model.joblib")
+    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
     model = load(model_path)
 
     # Use your trained model for prediction
@@ -73,7 +83,9 @@ def predict_outcomes(df):
 def predict(input_path, output):
     if output is None:
         output = sys.stdout
-    df = pd.read_csv(input_path, encoding="latin-1", encoding_errors="replace", low_memory=False)
+    df = pd.read_csv(
+        input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    )
     predictions = predict_outcomes(df)
     assert (
         predictions.shape[1] == 2
@@ -88,10 +100,10 @@ def predict(input_path, output):
 
 def score(prediction_path, ground_truth_path, output):
     """Score (evaluate) the predictions and write the metrics.
-    
+
     This function takes the path to a CSV file containing predicted outcomes and the
-    path to a CSV file containing the ground truth outcomes. It calculates the overall 
-    prediction accuracy, and precision, recall, and F1 score for having a child 
+    path to a CSV file containing the ground truth outcomes. It calculates the overall
+    prediction accuracy, and precision, recall, and F1 score for having a child
     and writes these scores to a new output CSV file.
 
     This function should not be modified.
@@ -107,9 +119,9 @@ def score(prediction_path, ground_truth_path, output):
     merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")
 
     # Calculate accuracy
-    accuracy = len(
-        merged_df[merged_df["prediction"] == merged_df["new_child"]]
-    ) / len(merged_df)
+    accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
+        merged_df
+    )
 
     # Calculate true positives, false positives, and false negatives
     true_positives = len(
@@ -136,14 +148,17 @@ def score(prediction_path, ground_truth_path, output):
     except ZeroDivisionError:
         f1_score = 0
     # Write metric output to a new CSV file
-    metrics_df = pd.DataFrame({
-        'accuracy': [accuracy],
-        'precision': [precision],
-        'recall': [recall],
-        'f1_score': [f1_score]
-    })
+    metrics_df = pd.DataFrame(
+        {
+            "accuracy": [accuracy],
+            "precision": [precision],
+            "recall": [recall],
+            "f1_score": [f1_score],
+        }
+    )
     metrics_df.to_csv(output, index=False)
 
+
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.command == "predict":
@@ -152,5 +167,5 @@ def score(prediction_path, ground_truth_path, output):
         score(args.prediction_path, args.ground_truth_path, args.output)
     else:
         parser.print_help()
-        predict(args.input_path, args.output)  
+        predict(args.input_path, args.output)
         sys.exit(1)