# -*- coding: utf-8 -*-
"""cusomter_churn_prediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15PtSQCZPlfP1qaiAIdRTi6ZuSCeqDiqO
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import pickle
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/customer_churn/dataset/customer_churn_large_dataset.xlsx"
df = pd.read_excel(path)
# pd.read_excel already returns a DataFrame, so no further conversion is needed
df.head()
df.shape
df.dtypes
df.describe(include='all')
"""**Count** for each column is 100,000 which is the total number of data points
**Location**
- Number of locations = 5
- Most common occurring location is **Houston** with 20157 data points
**Gender**
- Majority of the customers are **Female**
- Total number of females = 50216
**Age**
- Mininum age = 18
- Average age = 44
- Maximum Age = 70
**Subscription Duration**
- Minimum = 1 month
- Average = 1 year
- Maximum = 2 years
=> There are no subscriptions that are for more than 2 years
"""
df.info()
"""## => There are no null values present in the dataset
##This can easily be confirmed using the isnull() method as well as visualized using a bar plot
"""
df.isnull().any()
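# A per-column count of missing values makes the same check explicit
df.isnull().sum()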
"""## => This indicates that none of the features contains any null values.
## Visualization for the same can be seen below:
"""
df.count().plot.bar()
"""# **Outlier Detection**
## Visualization of outliers using Distribution
"""
# Creating a 1x2 grid of subplots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# Histogram of Age
axs[0].hist(df.Age, bins=(df.Age.max()-df.Age.min())+1, rwidth=0.8)
axs[0].set_title("Age Distribution")
# Histogram of subscription months
axs[1].hist(df.Subscription_Length_Months, bins=(df.Subscription_Length_Months.max()-df.Subscription_Length_Months.min())+1,rwidth=0.8)
axs[1].set_title("Subscription Distribution")
axs[0].set_ylabel('Count')
axs[0].set_xlabel('Ages')
axs[1].set_xlabel('# Months')
plt.show()
"""## **Age Distribution**
### Age histogram exhibits uniform distribution along with the repeated peaks. Same pattern is followed throughout which rules out any possibility of outliers
## **Subscription Distribution**
### - The even distribution of subscriptions suggests that people are subscribing uniformly across various time intervals.
### - It that there are no particular periods that attract significantly more or fewer subscriptions than others
"""
# Histogram of total usage
plt.hist(df.Total_Usage_GB, bins=20, rwidth=0.8)
plt.title("Total Usage Distribution")
plt.show()
"""### - There is a uniform distribution of usage across various data packages which indicates that each package is utilized by similar proportion of the total population
### - This could imply that there is no strong preference or bias towards any particular package
### - It is safe to say that there are no outliers
# **Outlier Detection using Z-Score**
### Considering threshold value to be 3. If the z-score > 3, this implies existence of outliers
"""
# Age
z_score_age = (df.Age - df.Age.mean())/df.Age.std()
z_score_age
df.Age[z_score_age.abs() > 3].count()
# Monthly Bill
z_score_bill = (df.Monthly_Bill - df.Monthly_Bill.mean())/df.Monthly_Bill.std()
z_score_bill
df.Monthly_Bill[z_score_bill.abs() > 3].count()
# Total Usage (GB)
z_score_usage = (df.Total_Usage_GB - df.Total_Usage_GB.mean())/df.Total_Usage_GB.std()
z_score_usage
df.Total_Usage_GB[z_score_usage.abs() > 3].count()
"""### The Z-score test implies that there are no existence of outliers in the features - Age, Monthly Bill and Total Usage"""
df.info()
"""# **Encoding Categorical Data**"""
df.head()
"""### There are two categorial features that need to be encoded, namely, Gender and Location
### Extracting unique values from each feature
"""
df['Gender'].unique()
"""=> Two unique values:
- Male
- Female
"""
df['Location'].unique()
"""=> Five unique values:
- Los Angeles
- New York
- Miami
- Chicago
- Houston
"""
df
"""### Applying Label-Encoding on Gender Feature
#### using sklearn
"""
df.info()
# Label Encoding
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df.info()
"""### Applying One Hot Encoding on Locations Feature
"""
# Build the post-transform column order: the encoded Location columns come first, then the passthrough columns
dummy_cols = pd.get_dummies(df['Location']).columns
dummy_cols = list(dummy_cols)
cols = dummy_cols + ['CustomerId','Name','Age','Gender'] + ['Subscription_Length_Months','Monthly_Bill','Total_Usage_GB','Churn']
cols
# Column index 4 is Location; OneHotEncoder output is placed first and the remainder is passed through
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[4])], remainder='passthrough')
df = pd.DataFrame(ct.fit_transform(df))
df.info()
df.columns = cols
df.info()
# Convert all columns to numeric types (Name coerces to NaN, but it is dropped below)
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df.info()
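"""### For reference, pandas offers a one-line alternative to the ColumnTransformer route that preserves the remaining column names (shown commented out, since Location has already been encoded at this point):
"""
# df = pd.get_dummies(df, columns=['Location'])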
"""# Train Test Splitting"""
y = df.iloc[:,-1]
y = np.array(y)
y
# Drop the identifier columns (CustomerId and Name), which carry no predictive signal
df = df.drop(columns=['CustomerId', 'Name'])
# Keep four of the five location dummies (New York is omitted, which also avoids the dummy-variable trap)
df = df[['Chicago','Houston','Los Angeles','Miami','Gender','Age','Subscription_Length_Months','Monthly_Bill','Total_Usage_GB']]
# Churn was already separated out as y above, so every remaining column is a feature
X = df
X
# splitting the dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train
"""# **Feature Scaling**
## Standardization (z-score)
"""
# Standardize only the numeric columns (indices 5 onward: Age, Subscription_Length_Months, Monthly_Bill, Total_Usage_GB)
sc = StandardScaler()
X_train.iloc[:,5:] = sc.fit_transform(X_train.iloc[:,5:])
X_train
# Transform (not fit) the test set using the training-set statistics
X_test.iloc[:,5:] = sc.transform(X_test.iloc[:,5:])
X_test
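"""### Sanity check: after standardization, the scaled training columns should have mean ~0 and std ~1:"""
X_train.iloc[:,5:].describe().loc[['mean','std']]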
"""# **Model Building**
## **Logistic Regression**
"""
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train,y_train)
y_pred_lg = lg_model.predict(X_test)
print(classification_report(y_test,y_pred_lg))
"""### Total overall accuracy is 50%
### Precision for True Negatives is 51% and that of True Positives is 51%
### The performance is average
## **Support** **Vector** Machines
"""
svm_model = SVC()
svm_model = svm_model.fit(X_train,y_train)
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test,y_pred_svm))
"""### The performance of SVM is quite similar to Logistic Regression
### However in some cases it performes better, like in case of recall and f1-score
## **Random** **Forest Classifier**
"""
rand_forest_model = RandomForestClassifier()
rand_forest_model.fit(X_train,y_train)
y_pred_forest = rand_forest_model.predict(X_test)
print(classification_report(y_test,y_pred_forest))
rand_forest_model.score(X_test,y_test)
"""### Overall the Random Forest is performing worse than both Logistic Regression and SVM with an accuracy of less than 50%
# Fine Tuning Random Forest
### Checking the performance by increasing the number of Trees
"""
rand_forest_model = RandomForestClassifier(n_estimators=20)
rand_forest_model.fit(X_train,y_train)
rand_forest_model.score(X_test,y_test)
"""### For 20 trees the model does not improve that much
### Further increasing the number of Trees
"""
rand_forest_model = RandomForestClassifier(n_estimators=40)
rand_forest_model.fit(X_train,y_train)
rand_forest_model.score(X_test,y_test)
rand_forest_pred = rand_forest_model.predict(X_test)
"""### Best performance occurs with n_estimators = 40
### Further increasing the value leads to diminishing returns
"""
# Plotting Confusion Matrix for Random Forest
cm = confusion_matrix(y_test,rand_forest_pred)
cm
# Commented out IPython magic to ensure Python compatibility.
# Visualizing the confusion matrix
# %matplotlib inline
import seaborn as sns
sns.heatmap(cm, annot=True, fmt='d')  # fmt='d' shows raw counts instead of scientific notation
plt.xlabel('Predicted')
plt.ylabel('Ground Truth')
plt.show()
"""### - 5200 True Negatives were correctly predicted
### - 4700 True Positives were correctly predicted
### - A similar number of predicted values were wrongly predicted
#### -- This implies the accuracy of the model is around 50% as seen previously
# Neural Network
"""
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Binary Crossentropy')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_accuracy(history):
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = nn_model.fit(
    X_train, y_train, epochs=100, batch_size=32, validation_split=0.2
)
plot_loss(history)
plot_accuracy(history)
"""## The Neural Networks are performing well as the loss is clearly decreasing and the accuracy is increasing with each iteration.
## The accuracy however is 53% which is better than the other algorithms
# Fine Tuning model parameters
## testing on learning rate = 0.005 and 64 neurons in the 2nd layer
"""
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# learning rate increased to 0.005 for this experiment
nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.005), loss='binary_crossentropy', metrics=['accuracy'])
history = nn_model.fit(
    X_train, y_train, epochs=100, batch_size=32, validation_split=0.2
)
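"""## Evaluating the tuned network on the held-out test set (a quick check, thresholding the sigmoid output at 0.5):"""
test_loss, test_acc = nn_model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_acc:.3f}")
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_nn))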
# Save the trained network with the Keras API (Keras models should not be pickled)
nn_model.save('model.h5')
# Save the fitted scaler so the same preprocessing can be applied at inference time
pickle.dump(sc, open('scaler.pkl', 'wb'))