Skip to content

Commit

Permalink
Removed src dir
Browse files Browse the repository at this point in the history
  • Loading branch information
vloothuis committed Mar 3, 2024
1 parent e861ef2 commit de7cd18
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 28 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ FROM continuumio/anaconda3:2023.03-1
COPY environment.yml /
RUN conda env create -f /environment.yml

RUN mkdir /src
RUN mkdir /app

COPY data /data
COPY src/script.py /src
COPY script.py /
COPY models /models

ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/src/script.py"]
ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/script.py"]
CMD ["predict", "/data/fake_data.csv"]
8 changes: 4 additions & 4 deletions src/Example pipeline.ipynb → Example pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@
"metadata": {},
"outputs": [],
"source": [
"models_path = os.path.join(\"..\", \"models\")\n",
"models_path = \"models\"\n",
"os.makedirs(models_path, exist_ok=True)\n",
"\n",
"# Dump model (don't change the name)\n",
Expand All @@ -249,9 +249,9 @@
"metadata": {},
"source": [
"# What the submission would look like\n",
"The snippet below is taken from the file `src/script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the downloaded data but also includes the holdout data.\n",
"The snippet below is taken from the file `script.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the downloaded data but also includes the holdout data.\n",
"\n",
"It then preprocesses the data in the same way as was done when training the model. If you make any adjustments to the preprocessing, they should also be copied to the `src/script.py` script (**the code below is just an excerpt**).\n",
"It then preprocesses the data in the same way as was done when training the model. If you make any adjustments to the preprocessing, they should also be copied to the `script.py` script (**the code below is just an excerpt**).\n",
"\n",
"Finally, the script loads the model that was saved in the step above and does the prediction."
]
Expand All @@ -271,7 +271,7 @@
" df = df.loc[:, keepcols]\n",
" \n",
" # Load your trained model from the models directory\n",
" model_path = os.path.join(os.path.dirname(__file__), \"..\", \"models\", \"model.joblib\")\n",
" model_path = os.path.join(os.path.dirname(__file__), \"models\", \"model.joblib\")\n",
" model = load(model_path)\n",
"\n",
" # Use your trained model for prediction\n",
Expand Down
57 changes: 36 additions & 21 deletions src/script.py → script.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,18 @@
subparsers = parser.add_subparsers(dest="command")

# Process subcommand
process_parser = subparsers.add_parser("predict", help="Process input data for prediction.")
process_parser = subparsers.add_parser(
"predict", help="Process input data for prediction."
)
process_parser.add_argument("input_path", help="Path to input data CSV file.")
process_parser.add_argument("--output", help="Path to prediction output CSV file.")

# Score subcommand
score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
score_parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
score_parser.add_argument(
"ground_truth_path", help="Path to ground truth outcome CSV file."
)
score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")

args = parser.parse_args()
Expand All @@ -54,14 +58,20 @@ def predict_outcomes(df):
# individual did not have a child during 2020-2022, while '1' implies that
# they did.

# Keep
keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']
# Keep
keepcols = [
"burgstat2019",
"leeftijd2019",
"woonvorm2019",
"oplmet2019",
"aantalki2019",
]
nomem_encr = df["nomem_encr"]

df = df.loc[:, keepcols]

# Load your trained model from the models directory
model_path = os.path.join(os.path.dirname(__file__), "..", "models", "model.joblib")
model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
model = load(model_path)

# Use your trained model for prediction
Expand All @@ -73,7 +83,9 @@ def predict_outcomes(df):
def predict(input_path, output):
if output is None:
output = sys.stdout
df = pd.read_csv(input_path, encoding="latin-1", encoding_errors="replace", low_memory=False)
df = pd.read_csv(
input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
)
predictions = predict_outcomes(df)
assert (
predictions.shape[1] == 2
Expand All @@ -88,10 +100,10 @@ def predict(input_path, output):

def score(prediction_path, ground_truth_path, output):
"""Score (evaluate) the predictions and write the metrics.
This function takes the path to a CSV file containing predicted outcomes and the
path to a CSV file containing the ground truth outcomes. It calculates the overall
prediction accuracy, and precision, recall, and F1 score for having a child
path to a CSV file containing the ground truth outcomes. It calculates the overall
prediction accuracy, and precision, recall, and F1 score for having a child
and writes these scores to a new output CSV file.
This function should not be modified.
Expand All @@ -107,9 +119,9 @@ def score(prediction_path, ground_truth_path, output):
merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")

# Calculate accuracy
accuracy = len(
merged_df[merged_df["prediction"] == merged_df["new_child"]]
) / len(merged_df)
accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
merged_df
)

# Calculate true positives, false positives, and false negatives
true_positives = len(
Expand All @@ -136,14 +148,17 @@ def score(prediction_path, ground_truth_path, output):
except ZeroDivisionError:
f1_score = 0
# Write metric output to a new CSV file
metrics_df = pd.DataFrame({
'accuracy': [accuracy],
'precision': [precision],
'recall': [recall],
'f1_score': [f1_score]
})
metrics_df = pd.DataFrame(
{
"accuracy": [accuracy],
"precision": [precision],
"recall": [recall],
"f1_score": [f1_score],
}
)
metrics_df.to_csv(output, index=False)


if __name__ == "__main__":
args = parser.parse_args()
if args.command == "predict":
Expand All @@ -152,5 +167,5 @@ def score(prediction_path, ground_truth_path, output):
score(args.prediction_path, args.ground_truth_path, args.output)
else:
parser.print_help()
predict(args.input_path, args.output)
predict(args.input_path, args.output)
sys.exit(1)

0 comments on commit de7cd18

Please sign in to comment.