Step 1. Import libraries

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import math 
import copy
import pickle
import gc

from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler

Step 2. Load and explore the data

TARGET = 'Response'
SEED = 94
print('Loading Data...')
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

submission_data = pd.read_csv('input/sample_submission.csv')

print('Data Load Successfully.')
Loading Data...
Data Load Successfully.
train.shape, test.shape
((11504798, 12), (7669866, 11))
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB
train.describe()
id Age Driving_License Region_Code Previously_Insured Annual_Premium Policy_Sales_Channel Vintage Response
count 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07
mean 5.752398e+06 3.838356e+01 9.980220e-01 2.641869e+01 4.629966e-01 3.046137e+04 1.124254e+02 1.638977e+02 1.229973e-01
std 3.321149e+06 1.499346e+01 4.443120e-02 1.299159e+01 4.986289e-01 1.645475e+04 5.403571e+01 7.997953e+01 3.284341e-01
min 0.000000e+00 2.000000e+01 0.000000e+00 0.000000e+00 0.000000e+00 2.630000e+03 1.000000e+00 1.000000e+01 0.000000e+00
25% 2.876199e+06 2.400000e+01 1.000000e+00 1.500000e+01 0.000000e+00 2.527700e+04 2.900000e+01 9.900000e+01 0.000000e+00
50% 5.752398e+06 3.600000e+01 1.000000e+00 2.800000e+01 0.000000e+00 3.182400e+04 1.510000e+02 1.660000e+02 0.000000e+00
75% 8.628598e+06 4.900000e+01 1.000000e+00 3.500000e+01 1.000000e+00 3.945100e+04 1.520000e+02 2.320000e+02 0.000000e+00
max 1.150480e+07 8.500000e+01 1.000000e+00 5.200000e+01 1.000000e+00 5.401650e+05 1.630000e+02 2.990000e+02 1.000000e+00
# Combine train and test for shared preprocessing
train['is_train'] = 1
test['is_train'] = 0

df = pd.concat([train, test])
df.head()
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1

Step 3. Data preprocessing

# Check missing values
df.isnull().sum()
id                            0
Gender                        0
Age                           0
Driving_License               0
Region_Code                   0
Previously_Insured            0
Vehicle_Age                   0
Vehicle_Damage                0
Annual_Premium                0
Policy_Sales_Channel          0
Vintage                       0
Response                7669866
is_train                      0
dtype: int64
The only missing values are in Response: the 7,669,866 NaNs are the test rows added by the concat, which have no target. Looking ahead to the correlation matrix computed below, a few relationships stand out:
  • Age and Vehicle_Age (0.77): strong positive correlation; older individuals tend to have older vehicles.
  • Previously_Insured and Vehicle_Damage (-0.84): strong negative correlation; previously insured customers are much less likely to report vehicle damage.
  • Policy_Sales_Channel and Age (-0.60): moderate negative correlation; younger individuals tend to be reached through particular sales channels.
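These pairwise values can be spot-checked without plotting the full heatmap; a minimal sketch, assuming the categorical columns have already been mapped to integers by transform_categorical_features below:

# Sketch: spot-check the highlighted pairs (requires numeric Vehicle_Age / Vehicle_Damage)
for a, b in [('Age', 'Vehicle_Age'),
             ('Previously_Insured', 'Vehicle_Damage'),
             ('Policy_Sales_Channel', 'Age')]:
    print(a, b, round(df[a].corr(df[b]), 2))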
def transform_categorical_features(df):
    print('Transforming categorical features..')

    gender_map = {'Male': 0, 'Female': 1}
    vehicle_age = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2} 
    vehicle_damage = {'No':0, 'Yes':1}

    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage)

    print("Transformed successfully.")
    return df
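Because .map only knows the listed categories, anything unexpected would silently become NaN. A quick sanity check that could be run right after the transform (a sketch, not part of the original notebook):

# Sketch: confirm no category was left unmapped and silently turned into NaN
for col in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
    assert df[col].isna().sum() == 0, f'Unmapped values found in {col}'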
def create_additional_features(df):
    print('Creating additional features..')
    
    df['Vehicle_Age_Policy_Sales_Channel'] = pd.factorize(df['Vehicle_Age'].astype(str) + df['Policy_Sales_Channel'].astype(str))[0]
    df['Age_Vehicle_Age'] = pd.factorize(df['Age'].astype(str) + df['Vehicle_Age'].astype(str))[0]
    df['Prev_Insured_Vehicle_Damage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str))[0]
    df['Prev_Insured_Vintage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vintage'].astype(str))[0]
    df['Policy_Sales_Channel_Age'] = pd.factorize(df['Policy_Sales_Channel'].astype(str) + df['Age'].astype(str))[0]

    return df
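Each interaction feature concatenates two columns as strings and lets pd.factorize assign one integer per unique combination. A tiny illustration of the pattern on made-up values (not from the dataset):

# Illustration with hypothetical values: one code per unique combination
a = pd.Series([0, 0, 1, 1])
b = pd.Series([10, 20, 10, 10])
print(pd.factorize(a.astype(str) + b.astype(str))[0])  # [0 1 2 2]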
def adjust_data_types(df):
    print('Adjusting data types')
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    
    return df
def optimize_memory_usage(df):
    print('Optimizing memory usage')
    start_mem_usage = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')

    return df
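As an aside, pandas can perform a similar numeric downcast directly; a shorter sketch of the same idea (assuming every column is already numeric, and stopping at float32 rather than float16):

# Sketch: built-in pandas downcasting as an alternative to the manual loop above
def downcast_numeric(df):
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')  # float32 at most
    return df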
df.head()
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1
df = transform_categorical_features(df)
df = adjust_data_types(df)  
df = create_additional_features(df)
df = optimize_memory_usage(df)

df.head()                          
Transforming categorical features..
Transformed successfully.
Adjusting data types
Creating additional features..
Optimizing memory usage
------ Memory usage before: 2560.09 MB
------ Memory usage after: 713.17 MB
------ Reduced memory usage by 72.1%
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train Vehicle_Age_Policy_Sales_Channel Age_Vehicle_Age Prev_Insured_Vehicle_Damage Prev_Insured_Vintage Policy_Sales_Channel_Age
0 0 0 21 1 35 0 1 1 65101 124 187 0.0 1 0 0 0 0 0
1 1 0 43 1 28 0 2 1 58911 26 288 1.0 1 1 1 0 1 1
2 2 1 25 1 14 1 0 0 38043 152 254 0.0 1 2 2 1 2 2
3 3 1 35 1 1 0 1 1 2630 156 76 0.0 1 3 3 0 3 3
4 4 1 36 1 15 1 1 0 31951 152 294 0.0 1 4 4 1 4 4
# Compute the correlation matrix
corr = df.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
min_max_scaler = MinMaxScaler()

# Select features to scale
features_to_scale = ['Annual_Premium', 'Vintage', 'Policy_Sales_Channel']

# Fit and transform the selected features
df[features_to_scale] = min_max_scaler.fit_transform(df[features_to_scale])
df.head()
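One caveat: the scaler is fitted on the combined train and test rows, which leaks test-set statistics into the transform. For min-max scaling of these columns the effect is likely negligible, but a stricter sketch would fit on the training rows only (assuming is_train is still present at this point):

# Sketch: fit the scaler on training rows only, then transform the full frame
scaler = MinMaxScaler()
scaler.fit(df.loc[df['is_train'] == 1, features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])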

Step 4. Split the data

# Split the data back into train and test sets
train = df[df['is_train'] == 1].drop(columns=['is_train'])
test = df[df['is_train'] == 0].drop(columns=['is_train'])

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_test = test.drop(columns=[TARGET])
# Note: real labels for the test set are not available; sample_submission only
# holds placeholder Response values, so it cannot serve as y_test for evaluation.
y_test = submission_data
X_train.shape
(11504798, 16)
y_train.shape
(11504798,)
X_test.shape
(7669866, 16)
y_test.shape
(7669866, 2)

Subsample the data to speed up the training process

# Reuse the sampled row index so X and y stay aligned
X_train_subsample = X_train.sample(frac=0.01, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]
X_test_subsample = X_test.sample(frac=0.01, random_state=42)
X_train_subsample.shape
X_test_subsample.shape
X_train.head()
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback

# Custom callback to print additional training information
# (defined here but not passed to model.fit below, so it does not affect the log output)
class CustomCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch + 1}/{self.params['epochs']}")
        print(f" - loss: {logs['loss']:.4f} - auc: {logs['auc']:.4f} - val_loss: {logs['val_loss']:.4f} - val_auc: {logs['val_auc']:.4f}")


# Build the model
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with AUC as a metric
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, auc = model.evaluate(X_test, y_test)
print(f"Test AUC: {auc:.4f}")

# Predict probabilities
y_pred_proba = model.predict(X_test).ravel()

# Calculate the AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")
Epoch 1/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 301s 1ms/step - auc_2: 0.5000 - loss: 132.1440 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 2/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 250s 868us/step - auc_2: 0.5003 - loss: 0.3755 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 3/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 872us/step - auc_2: 0.4995 - loss: 0.3732 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 4/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 258s 898us/step - auc_2: 0.4995 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3725
Epoch 5/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 264s 917us/step - auc_2: 0.4997 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 6/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 271s 940us/step - auc_2: 0.4992 - loss: 0.3731 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 7/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 270s 939us/step - auc_2: 0.4998 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 8/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 264s 918us/step - auc_2: 0.4998 - loss: 0.3733 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 9/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 261s 906us/step - auc_2: 0.4993 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3722
Epoch 10/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 265s 920us/step - auc_2: 0.4994 - loss: 0.3725 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 11/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 270s 936us/step - auc_2: 0.4999 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 12/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 257s 894us/step - auc_2: 0.4999 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 13/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 256s 888us/step - auc_2: 0.4999 - loss: 0.3726 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 14/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 248s 862us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 15/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 250s 867us/step - auc_2: 0.5002 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 16/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 872us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 17/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 871us/step - auc_2: 0.4995 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 18/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 253s 878us/step - auc_2: 0.4999 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 19/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 253s 879us/step - auc_2: 0.5002 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723



---------------------------------------------------------------------------

InvalidArgumentError                      Traceback (most recent call last)

Cell In[22], line 32
     29 history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])
     31 # Evaluate the model
---> 32 loss, auc = model.evaluate(X_test, y_test)
     33 print(f"Test AUC: {auc:.4f}")
     35 # Predict probabilities

(Keras and IPython framework frames omitted)

InvalidArgumentError: Graph execution error:

Detected at node UnsortedSegmentSum_1 (raised from the AUC metric's confusion-matrix update inside model.evaluate)

data.shape = [64] does not start with segment_ids.shape = [32]
	 [[{{node UnsortedSegmentSum_1}}]] [Op:__inference_one_step_on_iterator_26047228]
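Two issues likely explain the flat AUC of about 0.50 and the crash above: most inputs (id, Age, Region_Code, and the factorized interaction features) reach the network unscaled, and y_test is the two-column sample_submission frame rather than a label vector, so model.evaluate cannot score against it. A minimal sketch of one way to address both, scaling with the already-imported StandardScaler and evaluating on a held-out labelled validation split (variable names here are illustrative):

from sklearn.model_selection import train_test_split

# Sketch: scale the features and evaluate on a labelled validation split,
# since true labels for the competition test set are not available
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED)

scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

history = model.fit(X_tr_scaled, y_tr, epochs=100,
                    validation_data=(X_val_scaled, y_val),
                    callbacks=[early_stopping])

val_pred = model.predict(X_val_scaled).ravel()
print(f"Validation ROC AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Probabilities for the unlabeled test set, using the same scaler
test_pred = model.predict(scaler.transform(X_test)).ravel()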

Step 5. Train and evaluate the model

def train_and_evaluate(model, X, y, X_test, folds=10, random_state=None):
    print(f'Training {model.__class__.__name__}\n')
    
    scores = []
    feature_importances = np.zeros(X.shape[1])
    evaluation_history = []
    
    oof_pred_probs = np.zeros(X.shape[0])
    test_pred_probs = np.zeros(X_test.shape[0])
    
    skf = StratifiedKFold(n_splits=folds, random_state=random_state, shuffle=True)
    
    for fold_index, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        model_clone = copy.deepcopy(model)
        model_clone.fit(
                X_train, 
                y_train, 
                eval_set=[(X_val, y_val)], 
                verbose=500)
        
        feature_importances += model_clone.feature_importances_ / folds
        evaluation_history.append(model_clone.evals_result())
        
        y_pred_probs = model_clone.predict_proba(X_val)[:, 1]
        oof_pred_probs[val_index] = y_pred_probs
        
        temp_test_pred_probs = model_clone.predict_proba(X_test)[:, 1]
        test_pred_probs += temp_test_pred_probs / folds
        
        auc_score = roc_auc_score(y_val, y_pred_probs)
        scores.append(auc_score)
        
        print(f'\n--- Fold {fold_index + 1} - AUC: {auc_score:.5f}\n\n')
        
        del model_clone
        gc.collect()
    
    print(f'------ Average AUC: {np.mean(scores):.5f} ± {np.std(scores):.5f}\n\n')

    return oof_pred_probs, test_pred_probs
best_params = {
    'alpha': 1.302348865795227e-06, 
    'max_depth': 15, 
    'learning_rate': 0.061800451723613786, 
    'subsample': 0.7098803046786328, 
    'colsample_bytree': 0.2590672912533101, 
    'min_child_weight': 10, 
    'gamma': 0.8399887056014855, 
    'reg_alpha': 0.0016943548302122801, 
    'max_bin': 71284,
    'early_stopping_rounds': 50
}
best_xgb_model = XGBClassifier(**best_params, n_estimators=12000, random_state=94, eval_metric="auc")

# Call train_and_evaluate function with XGBClassifier model
oof_pred_probs, predictions = train_and_evaluate(best_xgb_model, X_train, y_train, X_test, folds=10, random_state=SEED)
submission = pd.DataFrame({
    'id': X_test['id'],
    'Response': predictions
})
submission.to_csv('submission.csv', index=False)
submission.head()

Possible improvements

Binning some features

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Binning Vintage
bins_vintage = [0, 200, 400, 600, 800, float('inf')]
labels_vintage = ['Very New', 'New', 'Moderately New', 'Experienced', 'Very Experienced']
df['Vintage_Binned'] = pd.cut(df['Vintage'], bins=bins_vintage, labels=labels_vintage)
# Binning Annual_Premium
bins_premium = [0, 10000, 30000, 50000, 100000, float('inf')]
labels_premium = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
df['Annual_Premium_Binned'] = pd.cut(df['Annual_Premium'], bins=bins_premium, labels=labels_premium)

# Encoding Policy_Sales_Channel
le = LabelEncoder()
df['Policy_Sales_Channel_Encoded'] = le.fit_transform(df['Policy_Sales_Channel'])

# Dropping original columns
df = df.drop(['Vintage', 'Annual_Premium', 'Policy_Sales_Channel'], axis=1)

df['Annual_Premium_Binned_Numeric'], _ = pd.factorize(df['Annual_Premium_Binned'])
df['Vintage_Binned_Numeric'],_ = pd.factorize(df['Vintage_Binned'])
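pd.factorize numbers the bins in order of first appearance, which throws away the natural ordering of the categories; since pd.cut returns an ordered Categorical, its category codes already carry that ordering. A small alternative sketch:

# Sketch: keep the bin order by using the Categorical codes instead of factorize
df['Annual_Premium_Binned_Numeric'] = df['Annual_Premium_Binned'].cat.codes
df['Vintage_Binned_Numeric'] = df['Vintage_Binned'].cat.codes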
Using RandomizedSearchCV for hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


xgb_params = {
    'colsample_bylevel': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bynode': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5, 0.6051, 1],
    'max_bin': [256, 512, 682, 1024],
    'max_delta_step': [0, 1, 5, 7, 10],
    'max_depth': [3, 5, 10, 20, 50, 68, 100],
    'min_child_weight': [1, 3, 5, 7, 10],
    'n_estimators': [100, 500, 1000, 5000, 10000],
    'reg_alpha': [0, 0.1, 0.4651, 0.5],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

# Set up cross-validation strategy
FOLDS = 5  # number of CV folds; FOLDS was not defined earlier in the notebook
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

xgb_model = XGBClassifier(objective="binary:logistic", n_jobs=-1, random_state=SEED, eval_metric="auc", verbosity=0, tree_method='hist')

random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_params, n_iter=5, scoring='roc_auc', cv=cv, verbose=1, random_state=SEED)
print(random_search)
random_search.fit(X_train_subsample, y_train_subsample)

print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)