diff --git a/Dockerfile b/Dockerfile
index 03a22c62..1e223577 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,11 +3,10 @@ FROM continuumio/anaconda3:2023.03-1
 
 COPY environment.yml /
 RUN conda env create -f /environment.yml
 
-RUN mkdir /src
 COPY data /data
-COPY src/script.py /src
+COPY *.py /
 COPY models /models
 
-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/src/script.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"]
 CMD ["predict", "/data/fake_data.csv"]
\ No newline at end of file
diff --git a/src/Example pipeline.ipynb b/Example pipeline.ipynb
similarity index 97%
rename from src/Example pipeline.ipynb
rename to Example pipeline.ipynb
index fa096d17..ba448d5f 100644
--- a/src/Example pipeline.ipynb
+++ b/Example pipeline.ipynb
@@ -236,7 +236,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "models_path = os.path.join(\"..\", \"models\")\n",
+    "models_path = \"models\"\n",
     "os.makedirs(models_path, exist_ok=True)\n",
     "\n",
     "# Dump model (don't change the name)\n",
@@ -249,9 +249,9 @@
    "metadata": {},
    "source": [
     "# How the submission would look like\n",
-    "The snippet below is taken from the file `src/script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
+    "The snippet below is taken from the file `submission.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n",
     "\n",
-    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `src/script.py` script (**the code below is just an excerpt**).\n",
+    "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `submission.py` script (**the code below is just an excerpt**).\n",
     "\n",
     "Finally the script loads the model that was saved in the step above and does the prediction."
    ]
@@ -271,7 +271,7 @@
     "    df = df.loc[:, keepcols]\n",
     "    \n",
     "    # Load your trained model from the models directory\n",
-    "    model_path = os.path.join(os.path.dirname(__file__), \"..\", \"models\", \"model.joblib\")\n",
+    "    model_path = os.path.join(os.path.dirname(__file__), \"models\", \"model.joblib\")\n",
     "    model = load(model_path)\n",
     "\n",
     "    # Use your trained model for prediction\n",
diff --git a/README.md b/README.md
index 6e7fcbc1..ccc42b7c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ The challenge is to predict whether an individual will have a child within a thr
 For the SICSS-ODISSEI Summer School 2023, the challenge consists of 2 rounds. [Round 1](https://eyra.co/benchmark/5) will close on **Wednesday 21 June 2023 at 16:00** and [Round 2](https://eyra.co/benchmark/6) will close on **Monday 26 June at 9:00 a.m.**
 
 ### Preparation
+
 1. Make sure you have filled out the [LISS panel Data Statement](https://statements.centerdata.nl/liss-panel-data-statement) form.
 2. Register and sign in on the [Next platform](https://eyra.co/benchmark/5) using your institution email address.
 3. Download the example data from the challenge website ([Round 1](https://eyra.co/benchmark/5), [Round 2](https://eyra.co/benchmark/6)) to tune your method:
@@ -26,12 +27,12 @@ For the SICSS-ODISSEI Summer School 2023. [R
 
 ### Participation
 
 1. Fork and clone [this](https://github.com/eyra/fertility-prediction-challenge) repository as explained [here](https://github.com/eyra/fertility-prediction-challenge/wiki#how-to-fork-and-clone-this-repository).
-2. Change the content of the **predict_outcomes function** in [script.py](https://github.com/eyra/fertility-prediction-challenge/blob/master/src/script.py) as explained in the script to include your method. Do not change the expected input and output data format.
-3. The metrics used to create the challenge [leaderboards](https://github.com/eyra/fertility-prediction-challenge/tree/master#leaderboard) are included in this repo. You can separate the challenge example data into a train and test set and use the score function in [script.py](https://github.com/eyra/fertility-prediction-challenge/blob/master/src/script.py) to determine your method performance scores on the example data as described [here](https://github.com/eyra/fertility-prediction-challenge/wiki#how-to-evaluate-your-method).
+2. Change the content of the **predict_outcomes function** in [submission.py](https://github.com/eyra/fertility-prediction-challenge/blob/master/submission.py) as explained in the script to include your method. Do not change the expected input and output data format.
+3. The metrics used to create the challenge [leaderboards](https://github.com/eyra/fertility-prediction-challenge/tree/master#leaderboard) are included in this repo. You can separate the challenge example data into a train and test set and use the score function in [submission.py](https://github.com/eyra/fertility-prediction-challenge/blob/master/submission.py) to determine your method performance scores on the example data as described [here](https://github.com/eyra/fertility-prediction-challenge/wiki#how-to-evaluate-your-method).
 4. Submit your method as explained [here](https://github.com/eyra/fertility-prediction-challenge/tree/master#how-to-submit-your-method).
-5. Your performance scores on the challenge [leaderboards](https://github.com/eyra/fertility-prediction-challenge/tree/master#leaderboard) will become available after signing in on the Next platform ([Round 1](https://eyra.co/benchmark/5), [Round 2](https://eyra.co/benchmark/6)).
+5. Your performance scores on the challenge [leaderboards](https://github.com/eyra/fertility-prediction-challenge/tree/master#leaderboard) will become available after signing in on the Next platform ([Round 1](https://eyra.co/benchmark/5), [Round 2](https://eyra.co/benchmark/6)).
 
-ℹ️ It takes some time to process the results for the leaderboards.
+ℹ️ It takes some time to process the results for the leaderboards.
 
 ### Leaderboards
diff --git a/src/script.py b/run.py
similarity index 65%
rename from src/script.py
rename to run.py
index 82ad811f..31137032 100644
--- a/src/script.py
+++ b/run.py
@@ -12,69 +12,47 @@
 The script can be run from the command line using the following command:
 
-python script.py input_path
+python run.py predict input_path
 
 An example for the provided test is:
 
-python script.py data/test_data_liss_2_subjects.csv
+python run.py predict data/test_data_liss_2_subjects.csv
 """
 
-import os
 import sys
 import argparse
 import pandas as pd
-from joblib import load
+import submission
 
 parser = argparse.ArgumentParser(description="Process and score data.")
 subparsers = parser.add_subparsers(dest="command")
 
 # Process subcommand
-process_parser = subparsers.add_parser("predict", help="Process input data for prediction.")
+process_parser = subparsers.add_parser(
+    "predict", help="Process input data for prediction."
+)
 process_parser.add_argument("input_path", help="Path to input data CSV file.")
 process_parser.add_argument("--output", help="Path to prediction output CSV file.")
 
 # Score subcommand
 score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
 score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
-score_parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
+score_parser.add_argument(
+    "ground_truth_path", help="Path to ground truth outcome CSV file."
+)
 score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
 
 args = parser.parse_args()
 
 
-def predict_outcomes(df):
-    """Process the input data and write the predictions."""
-
-    # The predict_outcomes function accepts a Pandas DataFrame as an argument
-    # and returns a new DataFrame with two columns: nomem_encr and
-    # prediction. The nomem_encr column in the new DataFrame replicates the
-    # corresponding column from the input DataFrame. The prediction
-    # column contains predictions for each corresponding nomem_encr. Each
-    # prediction is represented as a binary value: '0' indicates that the
-    # individual did not have a child during 2020-2022, while '1' implies that
-    # they did.
-
-    # Keep
-    keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']
-    nomem_encr = df["nomem_encr"]
-
-    df = df.loc[:, keepcols]
-
-    # Load your trained model from the models directory
-    model_path = os.path.join(os.path.dirname(__file__), "..", "models", "model.joblib")
-    model = load(model_path)
-
-    # Use your trained model for prediction
-    predictions = model.predict(df)
-    # Return the result as a Pandas DataFrame with the columns "nomem_encr" and "prediction"
-    return pd.concat([nomem_encr, pd.Series(predictions, name="prediction")], axis=1)
-
-
 def predict(input_path, output):
     if output is None:
         output = sys.stdout
-    df = pd.read_csv(input_path, encoding="latin-1", encoding_errors="replace", low_memory=False)
-    predictions = predict_outcomes(df)
+    df = pd.read_csv(
+        input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    )
+    df = submission.clean_df(df)
+    predictions = submission.predict_outcomes(df)
     assert (
         predictions.shape[1] == 2
     ), "Predictions must have two columns: nomem_encr and prediction"
@@ -88,10 +66,10 @@ def predict(input_path, output):
 
 def score(prediction_path, ground_truth_path, output):
     """Score (evaluate) the predictions and write the metrics.
-    
+
     This function takes the path to a CSV file containing predicted outcomes and the
-    path to a CSV file containing the ground truth outcomes. It calculates the overall
-    prediction accuracy, and precision, recall, and F1 score for having a child
+    path to a CSV file containing the ground truth outcomes. It calculates the overall
+    prediction accuracy, and precision, recall, and F1 score for having a child
     and writes these scores to a new output CSV file.
 
     This function should not be modified.
@@ -107,9 +85,9 @@ def score(prediction_path, ground_truth_path, output):
     merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")
 
     # Calculate accuracy
-    accuracy = len(
-        merged_df[merged_df["prediction"] == merged_df["new_child"]]
-    ) / len(merged_df)
+    accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
+        merged_df
+    )
 
     # Calculate true positives, false positives, and false negatives
     true_positives = len(
@@ -136,14 +114,17 @@
     except ZeroDivisionError:
         f1_score = 0
     # Write metric output to a new CSV file
-    metrics_df = pd.DataFrame({
-        'accuracy': [accuracy],
-        'precision': [precision],
-        'recall': [recall],
-        'f1_score': [f1_score]
-    })
+    metrics_df = pd.DataFrame(
+        {
+            "accuracy": [accuracy],
+            "precision": [precision],
+            "recall": [recall],
+            "f1_score": [f1_score],
+        }
+    )
     metrics_df.to_csv(output, index=False)
+
 
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.command == "predict":
@@ -152,5 +133,4 @@ def score(prediction_path, ground_truth_path, output):
         score(args.prediction_path, args.ground_truth_path, args.output)
     else:
         parser.print_help()
-        predict(args.input_path, args.output)
         sys.exit(1)
diff --git a/submission.py b/submission.py
new file mode 100644
index 00000000..2d5c9abb
--- /dev/null
+++ b/submission.py
@@ -0,0 +1,92 @@
+"""
+This is an example script to generate the outcome variable given the input dataset.
+
+This script should be modified to prepare your own submission that predicts
+the outcome for the benchmark challenge by changing the predict_outcomes function.
+
+The predict_outcomes function takes a Pandas data frame. The return value must
+be a data frame with two columns: nomem_encr and prediction. The nomem_encr column
+should contain the nomem_encr column from the input data frame. The prediction
+column should contain the predicted outcome for each nomem_encr. The prediction
+should be 0 (no child) or 1 (having a child).
+
+The script can be run from the command line using the following command:
+
+python run.py predict input_path
+
+An example for the provided test is:
+
+python run.py predict data/test_data_liss_2_subjects.csv
+"""
+
+import os
+import pandas as pd
+from joblib import load
+
+
+def clean_df(df):
+    """Process the input data to feed the model."""
+    ### If no cleaning is needed (e.g. because all of it happens inside a model pipeline), leave only the "return df" statement
+
+    # e.g. keep some variables (the ones you used in your model)
+    # keepcols = [
+    #     "burgstat2019",
+    #     "leeftijd2019",
+    #     "woonvorm2019",
+    #     "oplmet2019",
+    #     "aantalki2019",
+    # ]
+    # df = df.loc[:, keepcols]
+
+    return df
+
+
+def predict_outcomes(df):
+    """Process the input data and write the predictions."""
+
+    # The predict_outcomes function accepts a Pandas DataFrame as an argument
+    # and returns a new DataFrame with two columns: nomem_encr and
+    # prediction. The nomem_encr column in the new DataFrame replicates the
+    # corresponding column from the input DataFrame. The prediction
+    # column contains predictions for each corresponding nomem_encr. Each
+    # prediction is represented as a binary value: '0' indicates that the
+    # individual did not have a child during 2020-2022, while '1' implies that
+    # they did.
+
+    # Keep only the variables used by the model
+    keepcols = [
+        "burgstat2019",
+        "leeftijd2019",
+        "woonvorm2019",
+        "oplmet2019",
+        "aantalki2019",
+    ]
+    nomem_encr = df["nomem_encr"]
+
+    df = df.loc[:, keepcols]
+
+    # Load your trained model from the models directory
+    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
+    model = load(model_path)
+
+    # Use your trained model for prediction
+    predictions = model.predict(df)
+    # Return the result as a Pandas DataFrame with the columns "nomem_encr" and "prediction"
+    return pd.concat([nomem_encr, pd.Series(predictions, name="prediction")], axis=1)
+
+
+def test_submission(model_path="models/model.joblib"):
+    """Check end to end that the submission code runs on the fake data."""
+    # Load fake data
+    df = pd.read_csv(os.path.join(os.path.dirname(__file__), "data/fake_data.csv"))
+    ids = df[["nomem_encr"]].copy()
+
+    # Clean the data exactly as before training; the model expects these features
+    df = clean_df(df)
+
+    # Load model
+    model = load(model_path)
+
+    # Create prediction
+    ids["prediction"] = model.predict(df)
+    return ids
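
For reference, below is a minimal sketch of the training step that produces the models/model.joblib artifact loaded by predict_outcomes. The keepcols feature list, the artifact name, and the read_csv options come from the code above; the use of data/fake_data.csv as training input with a "new_child" outcome column, and the LogisticRegression estimator, are illustrative assumptions rather than the challenge's actual training recipe (Example pipeline.ipynb is the authoritative version).

# Minimal training sketch. Assumptions: the CSV contains the keepcols features
# plus a "new_child" outcome column; LogisticRegression is a stand-in estimator,
# not the model the example pipeline actually trains.
import os

import pandas as pd
from joblib import dump
from sklearn.linear_model import LogisticRegression

keepcols = ["burgstat2019", "leeftijd2019", "woonvorm2019", "oplmet2019", "aantalki2019"]

df = pd.read_csv(
    "data/fake_data.csv", encoding="latin-1", encoding_errors="replace", low_memory=False
)
df = df.dropna(subset=keepcols + ["new_child"])  # drop rows missing features or outcome

model = LogisticRegression(max_iter=1000)
model.fit(df[keepcols], df["new_child"])

os.makedirs("models", exist_ok=True)
dump(model, os.path.join("models", "model.joblib"))  # filename expected by predict_outcomes

Once the artifact exists, python run.py predict data/fake_data.csv --output predictions.csv exercises the same clean_df and predict_outcomes path as the container entrypoint, and python run.py score predictions.csv ground_truth.csv --output metrics.csv (with a held-out ground-truth CSV of your own) writes the accuracy, precision, recall, and F1 metrics.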