Step 1. Import libraries

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import math 
import copy
import pickle
import gc

from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler

Step 2. Load and explore the data

TARGET = 'Response'
SEED = 94
print('Loading Data...')
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

submission_data = pd.read_csv('input/sample_submission.csv')

print('Data Load Successfully.')
Loading Data...
Data Load Successfully.
train.shape, test.shape
((11504798, 12), (7669866, 11))
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB
train.describe()
id Age Driving_License Region_Code Previously_Insured Annual_Premium Policy_Sales_Channel Vintage Response
count 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07 1.150480e+07
mean 5.752398e+06 3.838356e+01 9.980220e-01 2.641869e+01 4.629966e-01 3.046137e+04 1.124254e+02 1.638977e+02 1.229973e-01
std 3.321149e+06 1.499346e+01 4.443120e-02 1.299159e+01 4.986289e-01 1.645475e+04 5.403571e+01 7.997953e+01 3.284341e-01
min 0.000000e+00 2.000000e+01 0.000000e+00 0.000000e+00 0.000000e+00 2.630000e+03 1.000000e+00 1.000000e+01 0.000000e+00
25% 2.876199e+06 2.400000e+01 1.000000e+00 1.500000e+01 0.000000e+00 2.527700e+04 2.900000e+01 9.900000e+01 0.000000e+00
50% 5.752398e+06 3.600000e+01 1.000000e+00 2.800000e+01 0.000000e+00 3.182400e+04 1.510000e+02 1.660000e+02 0.000000e+00
75% 8.628598e+06 4.900000e+01 1.000000e+00 3.500000e+01 1.000000e+00 3.945100e+04 1.520000e+02 2.320000e+02 0.000000e+00
max 1.150480e+07 8.500000e+01 1.000000e+00 5.200000e+01 1.000000e+00 5.401650e+05 1.630000e+02 2.990000e+02 1.000000e+00
# Combine train and test for shared preprocessing
train['is_train'] = 1
test['is_train'] = 0

df = pd.concat([train, test])
df.head()
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1

Step 3. Data preprocessing

# Check missing values
df.isnull().sum()
id                            0
Gender                        0
Age                           0
Driving_License               0
Region_Code                   0
Previously_Insured            0
Vehicle_Age                   0
Vehicle_Damage                0
Annual_Premium                0
Policy_Sales_Channel          0
Vintage                       0
Response                7669866
is_train                      0
dtype: int64
The only missing values are in Response: the 7,669,866 NaNs are the test rows added by the concat, which have no target. Looking ahead to the correlation matrix computed below, a few relationships stand out:
  • Age and Vehicle_Age (0.77): strong positive correlation; older individuals tend to have older vehicles.
  • Previously_Insured and Vehicle_Damage (-0.84): strong negative correlation; previously insured customers are much less likely to report vehicle damage.
  • Policy_Sales_Channel and Age (-0.60): moderate negative correlation; younger individuals tend to be reached through particular sales channels.
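These pairwise values can be spot-checked without plotting the full heatmap; a minimal sketch, assuming the categorical columns have already been mapped to integers by transform_categorical_features below:

# Sketch: spot-check the highlighted pairs (requires numeric Vehicle_Age / Vehicle_Damage)
for a, b in [('Age', 'Vehicle_Age'),
             ('Previously_Insured', 'Vehicle_Damage'),
             ('Policy_Sales_Channel', 'Age')]:
    print(a, b, round(df[a].corr(df[b]), 2))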
def transform_categorical_features(df):
    print('Transforming categorical features..')

    gender_map = {'Male': 0, 'Female': 1}
    vehicle_age = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2} 
    vehicle_damage = {'No':0, 'Yes':1}

    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage)

    print("Transformed successfully.")
    return df
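Because .map only knows the listed categories, anything unexpected would silently become NaN. A quick sanity check that could be run right after the transform (a sketch, not part of the original notebook):

# Sketch: confirm no category was left unmapped and silently turned into NaN
for col in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
    assert df[col].isna().sum() == 0, f'Unmapped values found in {col}'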
def create_additional_features(df):
    print('Creating additional features..')
    
    df['Vehicle_Age_Policy_Sales_Channel'] = pd.factorize(df['Vehicle_Age'].astype(str) + df['Policy_Sales_Channel'].astype(str))[0]
    df['Age_Vehicle_Age'] = pd.factorize(df['Age'].astype(str) + df['Vehicle_Age'].astype(str))[0]
    df['Prev_Insured_Vehicle_Damage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str))[0]
    df['Prev_Insured_Vintage'] = pd.factorize(df['Previously_Insured'].astype(str) + df['Vintage'].astype(str))[0]
    df['Policy_Sales_Channel_Age'] = pd.factorize(df['Policy_Sales_Channel'].astype(str) + df['Age'].astype(str))[0]

    return df
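Each interaction feature concatenates two columns as strings and lets pd.factorize assign one integer per unique combination. A tiny illustration of the pattern on made-up values (not from the dataset):

# Illustration with hypothetical values: one code per unique combination
a = pd.Series([0, 0, 1, 1])
b = pd.Series([10, 20, 10, 10])
print(pd.factorize(a.astype(str) + b.astype(str))[0])  # [0 1 2 2]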
def adjust_data_types(df):
    print('Adjusting data types')
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    
    return df
def optimize_memory_usage(df):
    print('Optimizing memory usage')
    start_mem_usage = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')

    return df
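As an aside, pandas can perform a similar numeric downcast directly; a shorter sketch of the same idea (assuming every column is already numeric, and stopping at float32 rather than float16):

# Sketch: built-in pandas downcasting as an alternative to the manual loop above
def downcast_numeric(df):
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')  # float32 at most
    return df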
df.head()
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train
0 0 Male 21 1 35.0 0 1-2 Year Yes 65101.0 124.0 187 0.0 1
1 1 Male 43 1 28.0 0 > 2 Years Yes 58911.0 26.0 288 1.0 1
2 2 Female 25 1 14.0 1 < 1 Year No 38043.0 152.0 254 0.0 1
3 3 Female 35 1 1.0 0 1-2 Year Yes 2630.0 156.0 76 0.0 1
4 4 Female 36 1 15.0 1 1-2 Year No 31951.0 152.0 294 0.0 1
df = transform_categorical_features(df)
df = adjust_data_types(df)  
df = create_additional_features(df)
df = optimize_memory_usage(df)

df.head()                          
Transforming categorical features..
Transformed successfully.
Adjusting data types
Creating additional features..
Optimizing memory usage
------ Memory usage before: 2560.09 MB
------ Memory usage after: 713.17 MB
------ Reduced memory usage by 72.1%
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response is_train Vehicle_Age_Policy_Sales_Channel Age_Vehicle_Age Prev_Insured_Vehicle_Damage Prev_Insured_Vintage Policy_Sales_Channel_Age
0 0 0 21 1 35 0 1 1 65101 124 187 0.0 1 0 0 0 0 0
1 1 0 43 1 28 0 2 1 58911 26 288 1.0 1 1 1 0 1 1
2 2 1 25 1 14 1 0 0 38043 152 254 0.0 1 2 2 1 2 2
3 3 1 35 1 1 0 1 1 2630 156 76 0.0 1 3 3 0 3 3
4 4 1 36 1 15 1 1 0 31951 152 294 0.0 1 4 4 1 4 4
# Compute the correlation matrix
corr = df.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
min_max_scaler = MinMaxScaler()

# Select features to scale
features_to_scale = ['Annual_Premium', 'Vintage', 'Policy_Sales_Channel']

# Fit and transform the selected features
df[features_to_scale] = min_max_scaler.fit_transform(df[features_to_scale])
df.head()
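One caveat: the scaler is fitted on the combined train and test rows, which leaks test-set statistics into the transform. For min-max scaling of these columns the effect is likely negligible, but a stricter sketch would fit on the training rows only (assuming is_train is still present at this point):

# Sketch: fit the scaler on training rows only, then transform the full frame
scaler = MinMaxScaler()
scaler.fit(df.loc[df['is_train'] == 1, features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])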

Step 4. Split the data

# Split the data back into train and test sets
train = df[df['is_train'] == 1].drop(columns=['is_train'])
test = df[df['is_train'] == 0].drop(columns=['is_train'])

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_test = test.drop(columns=[TARGET])
# Note: real labels for the test set are not available; sample_submission only
# holds placeholder Response values, so it cannot serve as y_test for evaluation.
y_test = submission_data
X_train.shape
(11504798, 16)
y_train.shape
(11504798,)
X_test.shape
(7669866, 16)
y_test.shape
(7669866, 2)

Subsample the data to speed up the training process

# Reuse the sampled row index so X and y stay aligned
X_train_subsample = X_train.sample(frac=0.01, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]
X_test_subsample = X_test.sample(frac=0.01, random_state=42)
X_train_subsample.shape
X_test_subsample.shape
X_train.head()
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback

# Custom callback to print additional training information
# (defined here but not passed to model.fit below, so it does not affect the log output)
class CustomCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch + 1}/{self.params['epochs']}")
        print(f" - loss: {logs['loss']:.4f} - auc: {logs['auc']:.4f} - val_loss: {logs['val_loss']:.4f} - val_auc: {logs['val_auc']:.4f}")


# Build the model
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with AUC as a metric
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, auc = model.evaluate(X_test, y_test)
print(f"Test AUC: {auc:.4f}")

# Predict probabilities
y_pred_proba = model.predict(X_test).ravel()

# Calculate the AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")
Epoch 1/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 301s 1ms/step - auc_2: 0.5000 - loss: 132.1440 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 2/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 250s 868us/step - auc_2: 0.5003 - loss: 0.3755 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 3/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 872us/step - auc_2: 0.4995 - loss: 0.3732 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 4/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 258s 898us/step - auc_2: 0.4995 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3725
Epoch 5/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 264s 917us/step - auc_2: 0.4997 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 6/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 271s 940us/step - auc_2: 0.4992 - loss: 0.3731 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 7/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 270s 939us/step - auc_2: 0.4998 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 8/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 264s 918us/step - auc_2: 0.4998 - loss: 0.3733 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 9/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 261s 906us/step - auc_2: 0.4993 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3722
Epoch 10/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 265s 920us/step - auc_2: 0.4994 - loss: 0.3725 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 11/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 270s 936us/step - auc_2: 0.4999 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 12/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 257s 894us/step - auc_2: 0.4999 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 13/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 256s 888us/step - auc_2: 0.4999 - loss: 0.3726 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 14/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 248s 862us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 15/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 250s 867us/step - auc_2: 0.5002 - loss: 0.3728 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 16/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 872us/step - auc_2: 0.4998 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 17/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 251s 871us/step - auc_2: 0.4995 - loss: 0.3727 - val_auc_2: 0.5000 - val_loss: 0.3724
Epoch 18/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 253s 878us/step - auc_2: 0.4999 - loss: 0.3730 - val_auc_2: 0.5000 - val_loss: 0.3723
Epoch 19/100
287620/287620 ━━━━━━━━━━━━━━━━━━━━ 253s 879us/step - auc_2: 0.5002 - loss: 0.3729 - val_auc_2: 0.5000 - val_loss: 0.3723



---------------------------------------------------------------------------

InvalidArgumentError                      Traceback (most recent call last)

Cell In[22], line 32
     29 history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])
     31 # Evaluate the model
---> 32 loss, auc = model.evaluate(X_test, y_test)
     33 print(f"Test AUC: {auc:.4f}")
     35 # Predict probabilities

(Keras and IPython framework frames omitted)

InvalidArgumentError: Graph execution error:

Detected at node UnsortedSegmentSum_1 (raised from the AUC metric's confusion-matrix update inside model.evaluate)

data.shape = [64] does not start with segment_ids.shape = [32]
	 [[{{node UnsortedSegmentSum_1}}]] [Op:__inference_one_step_on_iterator_26047228]
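Two issues likely explain the flat AUC of about 0.50 and the crash above: most inputs (id, Age, Region_Code, and the factorized interaction features) reach the network unscaled, and y_test is the two-column sample_submission frame rather than a label vector, so model.evaluate cannot score against it. A minimal sketch of one way to address both, scaling with the already-imported StandardScaler and evaluating on a held-out labelled validation split (variable names here are illustrative):

from sklearn.model_selection import train_test_split

# Sketch: scale the features and evaluate on a labelled validation split,
# since true labels for the competition test set are not available
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED)

scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

history = model.fit(X_tr_scaled, y_tr, epochs=100,
                    validation_data=(X_val_scaled, y_val),
                    callbacks=[early_stopping])

val_pred = model.predict(X_val_scaled).ravel()
print(f"Validation ROC AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Probabilities for the unlabeled test set, using the same scaler
test_pred = model.predict(scaler.transform(X_test)).ravel()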

Step 5. Train and evaluate the model

def train_and_evaluate(model, X, y, X_test, folds=10, random_state=None):
    print(f'Training {model.__class__.__name__}\n')
    
    scores = []
    feature_importances = np.zeros(X.shape[1])
    evaluation_history = []
    
    oof_pred_probs = np.zeros(X.shape[0])
    test_pred_probs = np.zeros(X_test.shape[0])
    
    skf = StratifiedKFold(n_splits=folds, random_state=random_state, shuffle=True)
    
    for fold_index, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        model_clone = copy.deepcopy(model)
        model_clone.fit(
                X_train, 
                y_train, 
                eval_set=[(X_val, y_val)], 
                verbose=500)
        
        feature_importances += model_clone.feature_importances_ / folds
        evaluation_history.append(model_clone.evals_result())
        
        y_pred_probs = model_clone.predict_proba(X_val)[:, 1]
        oof_pred_probs[val_index] = y_pred_probs
        
        temp_test_pred_probs = model_clone.predict_proba(X_test)[:, 1]
        test_pred_probs += temp_test_pred_probs / folds
        
        auc_score = roc_auc_score(y_val, y_pred_probs)
        scores.append(auc_score)
        
        print(f'\n--- Fold {fold_index + 1} - AUC: {auc_score:.5f}\n\n')
        
        del model_clone
        gc.collect()
    
    print(f'------ Average AUC: {np.mean(scores):.5f} ± {np.std(scores):.5f}\n\n')

    return oof_pred_probs, test_pred_probs
best_params = {
    'alpha': 1.302348865795227e-06, 
    'max_depth': 15, 
    'learning_rate': 0.061800451723613786, 
    'subsample': 0.7098803046786328, 
    'colsample_bytree': 0.2590672912533101, 
    'min_child_weight': 10, 
    'gamma': 0.8399887056014855, 
    'reg_alpha': 0.0016943548302122801, 
    'max_bin': 71284,
    'early_stopping_rounds': 50
}
best_xgb_model = XGBClassifier(**best_params, n_estimators=12000, random_state=94, eval_metric="auc")

# Call train_and_evaluate function with XGBClassifier model
oof_pred_probs, predictions = train_and_evaluate(best_xgb_model, X_train, y_train, X_test, folds=10, random_state=SEED)
submission = pd.DataFrame({
    'id': X_test['id'],
    'Response': predictions
})
submission.to_csv('submission.csv', index=False)
submission.head()

Possible improvements

Binning some features

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Binning Vintage
bins_vintage = [0, 200, 400, 600, 800, float('inf')]
labels_vintage = ['Very New', 'New', 'Moderately New', 'Experienced', 'Very Experienced']
df['Vintage_Binned'] = pd.cut(df['Vintage'], bins=bins_vintage, labels=labels_vintage)
# Binning Annual_Premium
bins_premium = [0, 10000, 30000, 50000, 100000, float('inf')]
labels_premium = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
df['Annual_Premium_Binned'] = pd.cut(df['Annual_Premium'], bins=bins_premium, labels=labels_premium)

# Encoding Policy_Sales_Channel
le = LabelEncoder()
df['Policy_Sales_Channel_Encoded'] = le.fit_transform(df['Policy_Sales_Channel'])

# Dropping original columns
df = df.drop(['Vintage', 'Annual_Premium', 'Policy_Sales_Channel'], axis=1)

df['Annual_Premium_Binned_Numeric'], _ = pd.factorize(df['Annual_Premium_Binned'])
df['Vintage_Binned_Numeric'],_ = pd.factorize(df['Vintage_Binned'])
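pd.factorize numbers the bins in order of first appearance, which throws away the natural ordering of the categories; since pd.cut returns an ordered Categorical, its category codes already carry that ordering. A small alternative sketch:

# Sketch: keep the bin order by using the Categorical codes instead of factorize
df['Annual_Premium_Binned_Numeric'] = df['Annual_Premium_Binned'].cat.codes
df['Vintage_Binned_Numeric'] = df['Vintage_Binned'].cat.codes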
Using RandomizedSearchCV for hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


xgb_params = {
    'colsample_bylevel': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bynode': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5, 0.6051, 1],
    'max_bin': [256, 512, 682, 1024],
    'max_delta_step': [0, 1, 5, 7, 10],
    'max_depth': [3, 5, 10, 20, 50, 68, 100],
    'min_child_weight': [1, 3, 5, 7, 10],
    'n_estimators': [100, 500, 1000, 5000, 10000],
    'reg_alpha': [0, 0.1, 0.4651, 0.5],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

# Set up cross-validation strategy
FOLDS = 5  # number of CV folds; FOLDS was not defined earlier in the notebook
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

xgb_model = XGBClassifier(objective="binary:logistic", n_jobs=-1, random_state=SEED, eval_metric="auc", verbosity=0, tree_method='hist')

random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_params, n_iter=5, scoring='roc_auc', cv=cv, verbose=1, random_state=SEED)
print(random_search)
random_search.fit(X_train_subsample, y_train_subsample)

print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)