The challenge is to predict which customers respond positively to an automobile insurance offer.

Step 1. Import libraries

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import math 
import copy
import pickle
import gc

from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler

Step 2. Load and explore the data

# Notebook-wide configuration: name of the label column and the random seed.
TARGET, SEED = 'Response', 94

print('Loading Data...')
# Read the train set, the test set and the sample submission, in that order.
train, test, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/sample_submission.csv',
    )
)
print('Data Load Successfully.')
Loading Data...
Data Load Successfully.
# Show (rows, columns) of the train and test frames.
(train.shape, test.shape)
((11504798, 12), (7669866, 11))
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB
id Age Driving_License Region_Code Previously_Insured Annual_Premium Policy_Sales_Channel Vintage Response
count 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07
mean 5.752398e+06 3.838356e+01 9.980220e-01 2.641869e+01 4.629966e-01 3.046137e+04 1.124254e+02 1.638977e+02 1.229973e-01
std 3.321149e+06 1.499346e+01 4.443120e-02 1.299159e+01 4.986289e-01 1.645475e+04 5.403571e+01 7.997953e+01 3.284341e-01
min 0.000000e+00 2.000000e+01 0.000000e+00 0.000000e+00 0.000000e+00 2.630000e+03 1.000000e+00 1.000000e+01 0.000000e+00
25% 2.876199e+06 2.400000e+01 1.000000e+00 1.500000e+01 0.000000e+00 2.527700e+04 2.900000e+01 9.900000e+01 0.000000e+00
50% 5.752398e+06 3.600000e+01 1.000000e+00 2.800000e+01 0.000000e+00 3.182400e+04 1.510000e+02 1.660000e+02 0.000000e+00
75% 8.628598e+06 4.900000e+01 1.000000e+00 3.500000e+01 1.000000e+00 3.945100e+04 1.520000e+02 2.320000e+02 0.000000e+00
max 1.150480e+07 8.500000e+01 1.000000e+00 5.200000e+01 1.000000e+00 5.401650e+05 1.630000e+02 2.990000e+02 1.000000e+00
# Combine the datasets for processing. Each row is flagged with its origin
# (1 = train, 0 = test) so the two parts can be separated again afterwards.
train.loc[:, 'is_train'] = 1
test.loc[:, 'is_train'] = 0

# Stack train on top of test; each frame keeps its own index labels.
df = pd.concat((train, test), axis=0)
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1

Step 3. Data preprocessing

# Check missing values
id                            0
Gender                        0
Age                           0
Driving_License               0
Region_Code                   0
Previously_Insured            0
Vehicle_Age                   0
Vehicle_Damage                0
Annual_Premium                0
Policy_Sales_Channel          0
Vintage                       0
Response                7669866
is_train                      0
dtype: int64
  • Age and Vehicle_Age (0.77): Strong positive correlation. Older individuals tend to have older vehicles.
  • Previously_Insured and Vehicle_Damage (-0.84): Strong negative correlation. If someone is previously insured, their vehicle is less likely to be damaged.
  • Policy_Sales_Channel and Age (-0.60): Moderate negative correlation. Younger individuals are more likely to be reached through certain sales channels.
def transform_categorical_features(df):
    """Replace the three text columns of ``df`` with integer codes, in place.

    ``Gender``, ``Vehicle_Age`` and ``Vehicle_Damage`` are mapped through fixed
    lookup tables; any value missing from a table becomes NaN (the behaviour
    of ``Series.map``). The same ``df`` object is returned.
    """
    print('Transforming categorical features..')

    # Column name -> {original label: integer code}
    encodings = {
        'Gender': {'Male': 0, 'Female': 1},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
        'Vehicle_Damage': {'No': 0, 'Yes': 1},
    }
    for column, lookup in encodings.items():
        df[column] = df[column].map(lookup)

    print("Transformed successfully.")
    return df
def create_additional_features(df):
    """Add five pairwise interaction features to ``df`` in place and return it.

    Each new column is an integer code identifying the combination of two
    existing columns. Codes are assigned by ``pd.factorize`` in order of first
    appearance, so they are arbitrary identifiers, not ordinal values.

    The two source values are joined with an explicit ``'_'`` separator.
    Plain concatenation is ambiguous: ``'1' + '11'`` and ``'11' + '1'`` both
    give ``'111'``, which would merge two different combinations into one code.
    """
    print('Creating additional features..')

    # (new column, left source column, right source column)
    interactions = (
        ('Vehicle_Age_Policy_Sales_Channel', 'Vehicle_Age', 'Policy_Sales_Channel'),
        ('Age_Vehicle_Age', 'Age', 'Vehicle_Age'),
        ('Prev_Insured_Vehicle_Damage', 'Previously_Insured', 'Vehicle_Damage'),
        ('Prev_Insured_Vintage', 'Previously_Insured', 'Vintage'),
        ('Policy_Sales_Channel_Age', 'Policy_Sales_Channel', 'Age'),
    )
    for feature_name, left, right in interactions:
        combined = df[left].astype(str) + '_' + df[right].astype(str)
        df[feature_name] = pd.factorize(combined)[0]

    return df
def adjust_data_types(df):
    """Cast the float-typed code/amount columns of ``df`` to ``int`` in place.

    ``Region_Code``, ``Annual_Premium`` and ``Policy_Sales_Channel`` are
    converted with ``astype(int)`` (fractional parts are truncated). The same
    ``df`` object is returned.
    """
    print('Adjusting data types')
    for column in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'):
        df[column] = df[column].astype(int)
    return df
def optimize_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory.

    For every column the observed minimum and maximum are compared with the
    representable range of progressively wider dtypes, and the column is cast
    to the first one that fits:

    * ``int*`` columns   -> int8, int16, int32 or int64
    * ``float*`` columns -> float16, float32, otherwise float64
    * other numeric columns (e.g. bool, unsigned ints) are left untouched

    Memory usage before and after is printed, and the same ``df`` is returned.

    Raises
    ------
    ValueError
        If a column is not numeric (object, category, string, datetime, ...);
        such columns must be encoded before calling this function.
    """
    print('Optimizing memory usage')
    start_mem_usage = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        # Anything that is not bool/int/uint/float cannot be range-checked.
        if getattr(col_type, 'kind', 'O') not in 'biuf':
            raise ValueError(f"Column '{col}' is of type '{col_type}'")

        c_min = df[col].min()
        c_max = df[col].max()
        type_name = str(col_type)

        if type_name.startswith('int'):
            # Bounds are inclusive: a column whose max is exactly 127 fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name.startswith('float'):
            # NOTE: float16 only keeps about 3 significant decimal digits, so
            # values that fit its *range* are still rounded when downcast.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN (empty/all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    if start_mem_usage > 0:  # guard the percentage against a zero-size frame
        print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')

    return df
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1
# Run the preprocessing steps in order: encode the text columns, cast the
# float codes to int, build the interaction features, then downcast dtypes.
df = (
    df.pipe(transform_categorical_features)
      .pipe(adjust_data_types)
      .pipe(create_additional_features)
      .pipe(optimize_memory_usage)
)

Transforming categorical features..
Transformed successfully.
Adjusting data types
Creating additional features..
Optimizing memory usage
------ Memory usage before: 2560.09 MB
------ Memory usage after: 713.17 MB
------ Reduced memory usage by 72.1%
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train Vehicle_Age_Policy_Sales_Channel Age_Vehicle_Age Prev_Insured_Vehicle_Damage Prev_Insured_Vintage Policy_Sales_Channel_Age
0 0 0 21 1 35 0 1 1 65101 124 187 0.0 1 0 0 0 0 0
1 1 0 43 1 28 0 2 1 58911 26 288 1.0 1 1 1 0 1 1
2 2 1 25 1 14 1 0 0 38043 152 254 0.0 1 2 2 1 2 2
3 3 1 35 1 1 0 1 1 2630 156 76 0.0 1 3 3 0 3 3
4 4 1 36 1 15 1 1 0 31951 152 294 0.0 1 4 4 1 4 4
# Pairwise correlation between every pair of columns in the combined frame
corr = df.corr()

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
).set_title('Correlation Matrix')
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
min_max_scaler = MinMaxScaler()

# Select features to scale
features_to_scale = ['Annual_Premium', 'Vintage', 'Policy_Sales_Channel']

# Learn the min/max from the training rows only, so that no statistics of the
# test set leak into preprocessing, then apply the same scaling to every row.
# Test values outside the training range therefore map slightly outside [0, 1].
# A positional boolean mask is used because train and test rows share index
# labels after the concat.
train_row_mask = (df['is_train'] == 1).to_numpy()
min_max_scaler.fit(df.loc[train_row_mask, features_to_scale])
df[features_to_scale] = min_max_scaler.transform(df[features_to_scale])

Step 4. Split the data

# Separate the combined frame into its train and test parts again; the
# origin flag has served its purpose and is dropped from both.
train = df.loc[df['is_train'].eq(1)].drop(columns='is_train')
test = df.loc[df['is_train'].eq(0)].drop(columns='is_train')

# Labelled rows: target vector and feature matrix
y_train = train[TARGET]
X_train = train.drop(columns=TARGET)

# Unlabelled rows: features only (TARGET is all-NaN for the test part)
X_test = test.drop(columns=TARGET)
# NOTE(review): submission_data is the sample-submission file loaded earlier,
# so y_test presumably holds placeholder values rather than true labels.
# Confirm before scoring anything against it.
y_test = submission_data
(11504798, 16)
(7669866, 16)
(7669866, 2)

Subsample the data to speed up the training process

# Draw a 1% random sample from each frame with one fixed seed.
# NOTE(review): X_train and y_train are sampled independently. They presumably
# stay row-aligned only because both share the same length and seed; verify.
X_train_subsample, y_train_subsample, X_test_subsample = (
    frame.sample(frac=0.01, random_state=42)
    for frame in (X_train, y_train, X_test)
)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback

# Custom callback to print additional training information
class CustomCallback(Callback):
    """Print loss and AUC for the training and validation data after each epoch.

    Keras may append a numeric suffix to automatically named metrics (the
    training log in this notebook reports ``auc_2`` / ``val_auc_2``). Metrics
    are therefore looked up by exact name first and then by ``<name>_`` prefix.
    A metric that is absent from ``logs`` is printed as ``nan`` instead of
    raising ``KeyError``.
    """

    @staticmethod
    def _metric(logs, name):
        """Return ``logs[name]``, a suffixed variant such as ``name_2``, or NaN."""
        if name in logs:
            return logs[name]
        for key, value in logs.items():
            if key.startswith(f'{name}_'):
                return value
        return float('nan')

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}  # Keras may call the hook without a logs dict
        loss = self._metric(logs, 'loss')
        auc = self._metric(logs, 'auc')
        val_loss = self._metric(logs, 'val_loss')
        val_auc = self._metric(logs, 'val_auc')
        print(f"Epoch {epoch + 1}/{self.params['epochs']}")
        print(f" - loss: {loss:.4f} - auc: {auc:.4f} - val_loss: {val_loss:.4f} - val_auc: {val_auc:.4f}")

# Build the model: a small fully-connected network with a sigmoid output
# that yields the probability of a positive response.
model = Sequential()
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with AUC as a metric. The metric gets an explicit name so
# the log keys stay 'auc' / 'val_auc' even when this cell is re-run.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Hold out the last 20% of the training rows for validation. This is the same
# slice that `validation_split=0.2` would use, but made explicit so it can be
# reused for evaluation below. The competition test set has no labels, so it
# cannot be scored.
split_at = int(len(X_train) * 0.8)
X_fit, X_val = X_train.iloc[:split_at], X_train.iloc[split_at:]
y_fit, y_val = y_train.iloc[:split_at], y_train.iloc[split_at:]

# Train the model
# NOTE(review): the recorded run stayed at AUC ~0.5 with a first-epoch loss
# above 100. The inputs include large unscaled columns (e.g. `id`), so
# standardising the features would presumably help; verify.
history = model.fit(X_fit, y_fit, epochs=100, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model on the held-out validation rows
loss, auc = model.evaluate(X_val, y_val)
print(f"Validation AUC: {auc:.4f}")

# Cross-check the Keras metric with scikit-learn on the same validation rows
y_val_pred_proba = model.predict(X_val).ravel()
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Predict probabilities for the unlabelled test set
y_pred_proba = model.predict(X_test).ravel()
Epoch 1/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m301s�[0m 1ms/step - auc_2: 0.5000 - loss: 132.1440 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 2/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m250s�[0m 868us/step - auc_2: 0.5003 - loss: 0.3755 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 3/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m251s�[0m 872us/step - auc_2: 0.4995 - loss: 0.3732 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 4/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m258s�[0m 898us/step - auc_2: 0.4995 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3725
Epoch 5/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m264s�[0m 917us/step - auc_2: 0.4997 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 6/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m271s�[0m 940us/step - auc_2: 0.4992 - loss: 0.3731 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 7/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m270s�[0m 939us/step - auc_2: 0.4998 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 8/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m264s�[0m 918us/step - auc_2: 0.4998 - loss: 0.3733 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 9/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m261s�[0m 906us/step - auc_2: 0.4993 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3722
Epoch 10/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m265s�[0m 920us/step - auc_2: 0.4994 - loss: 0.3725 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 11/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m270s�[0m 936us/step - auc_2: 0.4999 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 12/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m257s�[0m 894us/step - auc_2: 0.4999 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 13/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m256s�[0m 888us/step - auc_2: 0.4999 - loss: 0.3726 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 14/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m248s�[0m 862us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 15/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m250s�[0m 867us/step - auc_2: 0.5002 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 16/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m251s�[0m 872us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 17/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m251s�[0m 871us/step - auc_2: 0.4995 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 18/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m253s�[0m 878us/step - auc_2: 0.4999 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 19/100
�[1m287620/287620�[0m �[32m━━━━━━━━━━━━━━━━━━━━�[0m�[37m�[0m �[1m253s�[0m 879us/step - auc_2: 0.5002 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723


InvalidArgumentError                      Traceback (most recent call last)

Cell In[22], line 32
     29 history =, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])
     31 # Evaluate the model
---> 32 loss, auc = model.evaluate(X_test, y_test)
     33 print(f"Test AUC: {auc:.4f}")
     35 # Predict probabilities

Step 5. Train and evaluate the model

def train_and_evaluate(model, X, y, X_test, folds=10, random_state=None):
    """Cross-validate ``model`` with stratified K-fold and average its test predictions.

    For each fold a deep copy of ``model`` is fitted on the training part, with
    the validation part passed as ``eval_set``. It then predicts the validation
    rows (out-of-fold predictions) and the full ``X_test``. The AUC of every
    fold and the mean and standard deviation over all folds are printed.

    Parameters
    ----------
    model : estimator
        Must support ``fit(X, y, eval_set=..., verbose=...)`` and
        ``predict_proba`` (e.g. ``XGBClassifier``). It is never fitted itself.
    X, y : pandas DataFrame / Series
        Training features and binary labels; rows are selected with ``.iloc``.
    X_test : pandas DataFrame
        Rows to predict with every fold model.
    folds : int
        Number of stratified folds.
    random_state : int or None
        Seed for shuffling the folds; ``None`` gives a different split per call.

    Returns
    -------
    (oof_pred_probs, test_pred_probs) : tuple of numpy arrays
        Out-of-fold positive-class probabilities for ``X``, and the positive-
        class probabilities for ``X_test`` averaged over the fold models.
    """
    print(f'Training {model.__class__.__name__}\n')
    scores = []
    oof_pred_probs = np.zeros(X.shape[0])
    test_pred_probs = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=folds, random_state=random_state, shuffle=True)
    for fold_index, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model_clone = copy.deepcopy(model)
        model_clone.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=500,  # log the eval metric only every 500 boosting rounds
        )
        y_pred_probs = model_clone.predict_proba(X_val)[:, 1]
        oof_pred_probs[val_index] = y_pred_probs
        temp_test_pred_probs = model_clone.predict_proba(X_test)[:, 1]
        test_pred_probs += temp_test_pred_probs / folds
        auc_score = roc_auc_score(y_val, y_pred_probs)
        scores.append(auc_score)  # needed for the summary printed below
        print(f'\n--- Fold {fold_index + 1} - AUC: {auc_score:.5f}\n\n')
        del model_clone  # release the fitted copy before the next fold
    print(f'------ Average AUC: {np.mean(scores):.5f} ± {np.std(scores):.5f}\n\n')

    return oof_pred_probs, test_pred_probs
# Hyperparameters of the final XGBoost model.
# NOTE(review): both 'alpha' and 'reg_alpha' are set. XGBoost documents
# `alpha` as an alias of `reg_alpha`, so one of them is presumably redundant.
# Confirm which value is intended.
best_params = {
    'alpha': 1.302348865795227e-06,
    'max_depth': 15,
    'learning_rate': 0.061800451723613786,
    'subsample': 0.7098803046786328,
    'colsample_bytree': 0.2590672912533101,
    'min_child_weight': 10,
    'gamma': 0.8399887056014855,
    'reg_alpha': 0.0016943548302122801,
    'max_bin': 71284,
    'early_stopping_rounds': 50,
}
best_xgb_model = XGBClassifier(**best_params, n_estimators=12000, random_state=SEED, eval_metric="auc")

# Call train_and_evaluate function with XGBClassifier model
oof_pred_probs, predictions = train_and_evaluate(best_xgb_model, X_train, y_train, X_test, folds=10, random_state=SEED)

# Write the fold-averaged test probabilities in the submission format
submission = pd.DataFrame({
    'id': X_test['id'],
    'Response': predictions,
})
submission.to_csv('submission.csv', index=False)

Possible improvements

##Binning some features

from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE(review): the bin edges below look like thresholds in the raw units of
# the columns. If this cell runs after the MinMax scaling step, the values lie
# in [0, 1] and would nearly all fall into the first bin (exact zeros become
# NaN because the intervals are left-open). Confirm that it is applied to
# unscaled data.

# Binning Vintage
bins_vintage = [0, 200, 400, 600, 800, float('inf')]
labels_vintage = ['Very New', 'New', 'Moderately New', 'Experienced', 'Very Experienced']
df['Vintage_Binned'] = pd.cut(df['Vintage'], bins=bins_vintage, labels=labels_vintage)
# Binning Annual_Premium
bins_premium = [0, 10000, 30000, 50000, 100000, float('inf')]
labels_premium = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
df['Annual_Premium_Binned'] = pd.cut(df['Annual_Premium'], bins=bins_premium, labels=labels_premium)

# Encoding Policy_Sales_Channel
le = LabelEncoder()
df['Policy_Sales_Channel_Encoded'] = le.fit_transform(df['Policy_Sales_Channel'])

# Dropping original columns
df = df.drop(['Vintage', 'Annual_Premium', 'Policy_Sales_Channel'], axis=1)

# Numeric version of the bins. `pd.cut` returns an ordered categorical, so
# `.cat.codes` numbers the bins in their natural order (0 = lowest bin,
# -1 = value outside every bin). `pd.factorize` would number them by first
# appearance in the data and lose that ordering.
df['Annual_Premium_Binned_Numeric'] = df['Annual_Premium_Binned'].cat.codes
df['Vintage_Binned_Numeric'] = df['Vintage_Binned'].cat.codes
## Using RandomizedSearchCV - hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Candidate values sampled by the randomized search
xgb_params = {
    'colsample_bylevel': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bynode': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5, 0.6051, 1],
    'max_bin': [256, 512, 682, 1024],
    'max_delta_step': [0, 1, 5, 7, 10],
    'max_depth': [3, 5, 10, 20, 50, 68, 100],
    'min_child_weight': [1, 3, 5, 7, 10],
    'n_estimators': [100, 500, 1000, 5000, 10000],
    'reg_alpha': [0, 0.1, 0.4651, 0.5],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

# Number of cross-validation folds used by the search.
# NOTE(review): FOLDS is not defined anywhere in the visible notebook; 5 is an
# assumed value chosen to keep the search affordable. Adjust if needed.
FOLDS = 5

# Set up cross-validation strategy
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

xgb_model = XGBClassifier(objective="binary:logistic", n_jobs=-1, random_state=SEED, eval_metric="auc", verbosity=0, tree_method='hist')

random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_params, n_iter=5, scoring='roc_auc', cv=cv, verbose=1, random_state=SEED)
# Run the search on the 1% subsample to keep the runtime manageable
random_search.fit(X_train_subsample, y_train_subsample)

print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)


