Skip to content

Commit

Permalink
Update submission.py
Browse files Browse the repository at this point in the history
  • Loading branch information
AdrienneMendrik authored Mar 23, 2024
1 parent 33c3ef3 commit 75be10b
Showing 1 changed file with 71 additions and 36 deletions.
107 changes: 71 additions & 36 deletions submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,64 +2,99 @@
This is an example script to generate the outcome variable given the input dataset.
This script should be modified to prepare your own submission that predicts
the outcome for the benchmark challenge by changing the clean_df and predict_outcomes functions.
The predict_outcomes function takes a Pandas data frame. The return value must
be a data frame with two columns: nomem_encr and outcome. The nomem_encr column
should contain the nomem_encr column from the input data frame. The outcome
column should contain the predicted outcome for each nomem_encr. The outcome
should be 0 (no child) or 1 (having a child).
clean_df should be used to clean (preprocess) the data.
run.py can be used to test your submission.
"""

import os

# List your libraries and modules here. Don't forget to update environment.yml!
import pandas as pd
from joblib import load
from sklearn.linear_model import LogisticRegression
import joblib


def clean_df(df, background=None):
    """
    Preprocess the input dataframe to feed the model.

    This is a bare minimum working example: it derives an ``age`` variable
    from ``birthyear_bg`` and keeps only the columns used for modelling.
    The input dataframe itself is left unmodified.

    If no cleaning is done (e.g. if all the cleaning is done in a pipeline)
    leave only the "return df" command.

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw data (from PreFer_train_data.csv).
        Must contain the columns ``nomem_encr`` and ``birthyear_bg``. The
        outcome column ``new_child`` is optional.
    background (pd.DataFrame): Optional input dataframe containing background data (from PreFer_train_background_data.csv).
        Not used by this minimal example.

    Returns:
    pd.DataFrame: The cleaned dataframe with the columns ``nomem_encr``
        and ``age``, plus ``new_child`` when the input contains it. When the
        outcome column is present, rows with a missing outcome are dropped.

    Raises:
    KeyError: If ``nomem_encr`` or ``birthyear_bg`` is missing from ``df``.
    """

    # Create new variable with age. It is built as a separate Series so the
    # caller's dataframe does not silently gain an extra column.
    age = 2024 - df['birthyear_bg']

    # Imputing missing values in age with the mean (computed over all input
    # rows, i.e. before any filtering on the outcome)
    age = age.fillna(age.mean())

    # Selecting variables for modelling: the ID variable required for
    # predictions and the newly created age variable
    cleaned = df[['nomem_encr']].copy()
    cleaned['age'] = age

    # The outcome is only available in training data; holdout data used for
    # prediction does not have to contain it, so it is handled as optional.
    if 'new_child' in df.columns:
        cleaned['new_child'] = df['new_child']
        # Filter cases for whom the outcome is not available
        cleaned = cleaned[cleaned['new_child'].notna()]

    return cleaned


def predict_outcomes(df, model_path="model.joblib"):
    """Generate predictions using the saved model and the input dataframe.

    The predict_outcomes function accepts a Pandas DataFrame as an argument
    and returns a new DataFrame with two columns: nomem_encr and
    prediction. The nomem_encr column in the new DataFrame replicates the
    corresponding column from the cleaned input DataFrame. The prediction
    column contains predictions for each corresponding nomem_encr. Each
    prediction is represented as a binary value: '0' indicates that the
    individual did not have a child during 2021-2023, while '1' implies that
    they did.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    model_path (str): The path to the saved model file (which is the output of training.py).

    Returns:
    pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions.

    Raises:
    KeyError: If the identifier column ``nomem_encr`` is not in ``df``.
    """

    ## This script contains a bare minimum working example
    # Fail early with a clear message instead of printing and then running
    # into a less informative KeyError further down.
    if 'nomem_encr' not in df.columns:
        raise KeyError("The identifier variable 'nomem_encr' should be in the dataset")

    # Load the model
    model = joblib.load(model_path)

    # Preprocess the fake / holdout data
    df = clean_df(df)

    # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
    # get list of variables *without* the outcome:
    # NOTE(review): the identifier column is still among the features passed
    # to the model; this must match how the model was trained - confirm
    # against training.py.
    vars_without_outcome = df.columns[df.columns != 'new_child']

    # Generate predictions from model, should be 0 (no child) or 1 (had child)
    predictions = model.predict(df[vars_without_outcome])

    # Output file should be DataFrame with two columns, nomem_encr and predictions
    df_predict = pd.DataFrame({'nomem_encr': df['nomem_encr'], 'prediction': predictions})

    # Return only dataset with predictions and identifier
    return df_predict

0 comments on commit 75be10b

Please sign in to comment.