diff --git a/Dockerfile b/Dockerfile
index 03a22c62..de22b58d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,11 +3,11 @@ FROM continuumio/anaconda3:2023.03-1
 COPY environment.yml /
 RUN conda env create -f /environment.yml
 
-RUN mkdir /src
+RUN mkdir /app
 COPY data /data
-COPY src/script.py /src
+COPY script.py /
 COPY models /models
 
 
-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/src/script.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/script.py"]
 CMD ["predict", "/data/fake_data.csv"]
\ No newline at end of file
diff --git a/src/Example pipeline.ipynb b/Example pipeline.ipynb
similarity index 94%
rename from src/Example pipeline.ipynb
rename to Example pipeline.ipynb
index fa096d17..d929238b 100644
--- a/src/Example pipeline.ipynb
+++ b/Example pipeline.ipynb
@@ -236,7 +236,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "models_path = os.path.join(\"..\", \"models\")\n",
+    "models_path = \"models\"\n",
     "os.makedirs(models_path, exist_ok=True)\n",
     "\n",
     "# Dump model (don't change the name)\n",
@@ -249,9 +249,9 @@
    "metadata": {},
    "source": [
     "# How the submission would look like\n",
-    "The snippet below is taken from the file `src/script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
+    "The snippet below is taken from the file `script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
     "\n",
-    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `src/script.py` script (**the code below is just an excerpt**).\n",
+    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `script.py` script (**the code below is just an excerpt**).\n",
     "\n",
     "Finally the script loads the model that was saved in the step above and does the prediction."
    ]
@@ -271,7 +271,7 @@
     "    df = df.loc[:, keepcols]\n",
     "    \n",
     "    # Load your trained model from the models directory\n",
-    "    model_path = os.path.join(os.path.dirname(__file__), \"..\", \"models\", \"model.joblib\")\n",
+    "    model_path = os.path.join(os.path.dirname(__file__), \"models\", \"model.joblib\")\n",
     "    model = load(model_path)\n",
     "\n",
     "    # Use your trained model for prediction\n",
diff --git a/src/script.py b/script.py
similarity index 84%
rename from src/script.py
rename to script.py
index 82ad811f..3e948868 100644
--- a/src/script.py
+++ b/script.py
@@ -29,14 +29,18 @@
 subparsers = parser.add_subparsers(dest="command")
 
 # Process subcommand
-process_parser = subparsers.add_parser("predict", help="Process input data for prediction.")
+process_parser = subparsers.add_parser(
+    "predict", help="Process input data for prediction."
+)
 process_parser.add_argument("input_path", help="Path to input data CSV file.")
 process_parser.add_argument("--output", help="Path to prediction output CSV file.")
 
 # Score subcommand
 score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
 score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
-score_parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
+score_parser.add_argument(
+    "ground_truth_path", help="Path to ground truth outcome CSV file."
+)
 score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
 
 args = parser.parse_args()
@@ -54,14 +58,20 @@ def predict_outcomes(df):
     # individual did not have a child during 2020-2022, while '1' implies that
     # they did.
 
-    # Keep 
-    keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']
+    # Keep
+    keepcols = [
+        "burgstat2019",
+        "leeftijd2019",
+        "woonvorm2019",
+        "oplmet2019",
+        "aantalki2019",
+    ]
     nomem_encr = df["nomem_encr"]
-    
+
     df = df.loc[:, keepcols]
-    
+
     # Load your trained model from the models directory
-    model_path = os.path.join(os.path.dirname(__file__), "..", "models", "model.joblib")
+    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
     model = load(model_path)
 
     # Use your trained model for prediction
@@ -73,7 +83,9 @@ def predict_outcomes(df):
 def predict(input_path, output):
     if output is None:
         output = sys.stdout
-    df = pd.read_csv(input_path, encoding="latin-1", encoding_errors="replace", low_memory=False)
+    df = pd.read_csv(
+        input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    )
     predictions = predict_outcomes(df)
     assert (
         predictions.shape[1] == 2
@@ -88,10 +100,10 @@ def predict(input_path, output):
 
 def score(prediction_path, ground_truth_path, output):
     """Score (evaluate) the predictions and write the metrics.
-    
+
     This function takes the path to a CSV file containing predicted outcomes and the
-    path to a CSV file containing the ground truth outcomes. It calculates the overall 
-    prediction accuracy, and precision, recall, and F1 score for having a child 
+    path to a CSV file containing the ground truth outcomes. It calculates the overall
+    prediction accuracy, and precision, recall, and F1 score for having a child
     and writes these scores to a new output CSV file. This function should not be modified.
@@ -107,9 +119,9 @@ def score(prediction_path, ground_truth_path, output):
     merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")
 
     # Calculate accuracy
-    accuracy = len(
-        merged_df[merged_df["prediction"] == merged_df["new_child"]]
-    ) / len(merged_df)
+    accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
+        merged_df
+    )
 
     # Calculate true positives, false positives, and false negatives
     true_positives = len(
@@ -136,14 +148,17 @@ def score(prediction_path, ground_truth_path, output):
     except ZeroDivisionError:
         f1_score = 0
     # Write metric output to a new CSV file
-    metrics_df = pd.DataFrame({
-        'accuracy': [accuracy],
-        'precision': [precision],
-        'recall': [recall],
-        'f1_score': [f1_score]
-    })
+    metrics_df = pd.DataFrame(
+        {
+            "accuracy": [accuracy],
+            "precision": [precision],
+            "recall": [recall],
+            "f1_score": [f1_score],
+        }
+    )
     metrics_df.to_csv(output, index=False)
 
+
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.command == "predict":
@@ -152,5 +167,5 @@ def score(prediction_path, ground_truth_path, output):
         score(args.prediction_path, args.ground_truth_path, args.output)
     else:
         parser.print_help()
-        predict(args.input_path, args.output)
+        predict(args.input_path, args.output)
         sys.exit(1)
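For a quick local check of the flattened layout, the image can be built and run roughly as sketched below. The image tag eyra-rank is an arbitrary choice here (it simply mirrors the conda environment name), and the output path is only an illustration.

    docker build -t eyra-rank .

    # Default CMD: predict on the bundled fake data, predictions go to stdout.
    docker run --rm eyra-rank

    # Explicit subcommand, writing predictions to a file inside the container's /data directory.
    docker run --rm eyra-rank predict /data/fake_data.csv --output /data/predictions.csv

The predict and score subcommands and the --output flag come from the argparse setup in script.py; without a bind mount, the written file stays inside the container.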