# -*- coding: utf-8 -*-
"""cusomter_churn_prediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15PtSQCZPlfP1qaiAIdRTi6ZuSCeqDiqO
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import pickle
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/customer_churn/dataset/customer_churn_large_dataset.xlsx"
df = pd.read_excel(path)
# pd.read_excel already returns a DataFrame, so no further conversion is needed
df.head()
df.shape
df.dtypes
df.describe(include='all')
"""**Count** for each column is 100,000 which is the total number of data points
**Location**
- Number of locations = 5
- Most common occurring location is **Houston** with 20157 data points
**Gender**
- Majority of the customers are **Female**
- Total number of females = 50216
**Age**
- Mininum age = 18
- Average age = 44
- Maximum Age = 70
**Subscription Duration**
- Minimum = 1 month
- Average = 1 year
- Maximum = 2 years
=> There are no subscriptions that are for more than 2 years
"""
df.info()
"""## => There are no null values present in the dataset
##This can easily be confirmed using the isnull() method as well as visualized using a bar plot
"""
df.isnull().any()
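# A per-column count of missing values makes the same check explicit
df.isnull().sum()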
"""## => This indicates that none of the features contains any null values.
## Visualization for the same can be seen below:
"""
df.count().plot.bar()
"""# **Outlier Detection**
## Visualization of outliers using Distribution
"""
# Creating a 1x2 grid of subplots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# Histogram of Age
axs[0].hist(df.Age, bins=(df.Age.max()-df.Age.min())+1, rwidth=0.8)
axs[0].set_title("Age Distribution")
# Histogram of subscription months
axs[1].hist(df.Subscription_Length_Months, bins=(df.Subscription_Length_Months.max()-df.Subscription_Length_Months.min())+1,rwidth=0.8)
axs[1].set_title("Subscription Distribution")
axs[0].set_ylabel('Count')
axs[0].set_xlabel('Ages')
axs[1].set_xlabel('# Months')
plt.show()
"""## **Age Distribution**
### Age histogram exhibits uniform distribution along with the repeated peaks. Same pattern is followed throughout which rules out any possibility of outliers
## **Subscription Distribution**
### - The even distribution of subscriptions suggests that people are subscribing uniformly across various time intervals.
### - It that there are no particular periods that attract significantly more or fewer subscriptions than others
"""
# Histogram of total usage
plt.hist(df.Total_Usage_GB, bins=20, rwidth=0.8)
plt.title("Total Usage Distribution")
plt.show()
"""### - There is a uniform distribution of usage across various data packages which indicates that each package is utilized by similar proportion of the total population
### - This could imply that there is no strong preference or bias towards any particular package
### - It is safe to say that there are no outliers
# **Outlier Detection using Z-Score**
### Considering threshold value to be 3. If the z-score > 3, this implies existence of outliers
"""
# Age
z_score_age = (df.Age - df.Age.mean())/df.Age.std()
z_score_age
df.Age[z_score_age.abs() > 3].count()
# Monthly Bill
z_score_bill = (df.Monthly_Bill - df.Monthly_Bill.mean())/df.Monthly_Bill.std()
z_score_bill
df.Monthly_Bill[z_score_bill.abs() > 3].count()
# Total Usage (GB)
z_score_usage = (df.Total_Usage_GB - df.Total_Usage_GB.mean())/df.Total_Usage_GB.std()
z_score_usage
df.Total_Usage_GB[z_score_usage.abs() > 3].count()
"""### The Z-score test implies that there are no existence of outliers in the features - Age, Monthly Bill and Total Usage"""
df.info()
"""# **Encoding Categorical Data**"""
df.head()
"""### There are two categorial features that need to be encoded, namely, Gender and Location
### Extracting unique values from each feature
"""
df['Gender'].unique()
"""=> Two unique values:
- Male
- Female
"""
df['Location'].unique()
"""=> Five unique values:
- Los Angeles
- New York
- Miami
- Chicago
- Houston
"""
df
"""### Applying Label-Encoding on Gender Feature
#### using sklearn
"""
df.info()
# Label Encoding
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df.info()
"""### Applying One Hot Encoding on Locations Feature
"""
# Build the post-transform column order: the encoded Location columns come first, then the passthrough columns
dummy_cols = pd.get_dummies(df['Location']).columns
dummy_cols = list(dummy_cols)
cols = dummy_cols + ['CustomerId','Name','Age','Gender'] + ['Subscription_Length_Months','Monthly_Bill','Total_Usage_GB','Churn']
cols
# Column index 4 is Location; OneHotEncoder output is placed first and the remainder is passed through
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[4])], remainder='passthrough')
df = pd.DataFrame(ct.fit_transform(df))
df.info()
df.columns = cols
df.info()
# Convert all columns to numeric types (Name coerces to NaN, but it is dropped below)
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df.info()
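"""### For reference, pandas offers a one-line alternative to the ColumnTransformer route that preserves the remaining column names (shown commented out, since Location has already been encoded at this point):
"""
# df = pd.get_dummies(df, columns=['Location'])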
"""# Train Test Splitting"""
y = df.iloc[:,-1]
y = np.array(y)
y
# Drop the identifier columns (CustomerId and Name), which carry no predictive signal
df = df.drop(columns=['CustomerId', 'Name'])
# Keep four of the five location dummies (New York is omitted, which also avoids the dummy-variable trap)
df = df[['Chicago','Houston','Los Angeles','Miami','Gender','Age','Subscription_Length_Months','Monthly_Bill','Total_Usage_GB']]
# Churn was already separated out as y above, so every remaining column is a feature
X = df
X
# splitting the dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train
"""# **Feature Scaling**
## Standardization (z-score)
"""
# Standardize only the numeric columns (indices 5 onward: Age, Subscription_Length_Months, Monthly_Bill, Total_Usage_GB)
sc = StandardScaler()
X_train.iloc[:,5:] = sc.fit_transform(X_train.iloc[:,5:])
X_train
# Transform (not fit) the test set using the training-set statistics
X_test.iloc[:,5:] = sc.transform(X_test.iloc[:,5:])
X_test
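"""### Sanity check: after standardization, the scaled training columns should have mean ~0 and std ~1:"""
X_train.iloc[:,5:].describe().loc[['mean','std']]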
"""# **Model Building**
## **Logistic Regression**
"""
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train,y_train)
y_pred_lg = lg_model.predict(X_test)
print(classification_report(y_test,y_pred_lg))
"""### Total overall accuracy is 50%
### Precision for True Negatives is 51% and that of True Positives is 51%
### The performance is average
## **Support** **Vector** Machines
"""
svm_model = SVC()
svm_model = svm_model.fit(X_train,y_train)
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test,y_pred_svm))
"""### The performance of SVM is quite similar to Logistic Regression
### However in some cases it performes better, like in case of recall and f1-score
## **Random** **Forest Classifier**
"""
rand_forest_model = RandomForestClassifier()
rand_forest_model.fit(X_train,y_train)
y_pred_forest = rand_forest_model.predict(X_test)
print(classification_report(y_test,y_pred_forest))
rand_forest_model.score(X_test,y_test)
"""### Overall the Random Forest is performing worse than both Logistic Regression and SVM with an accuracy of less than 50%
# Fine Tuning Random Forest
### Checking the performance by increasing the number of Trees
"""
rand_forest_model = RandomForestClassifier(n_estimators=20)
rand_forest_model.fit(X_train,y_train)
rand_forest_model.score(X_test,y_test)
"""### For 20 trees the model does not improve that much
### Further increasing the number of Trees
"""
rand_forest_model = RandomForestClassifier(n_estimators=40)
rand_forest_model.fit(X_train,y_train)
rand_forest_model.score(X_test,y_test)
rand_forest_pred = rand_forest_model.predict(X_test)
"""### Best performance occurs with n_estimators = 40
### Further increasing the value leads to diminishing returns
"""
# Plotting Confusion Matrix for Random Forest
cm = confusion_matrix(y_test,rand_forest_pred)
cm
# Commented out IPython magic to ensure Python compatibility.
# Visualizing the confusion matrix
# %matplotlib inline
import seaborn as sns
sns.heatmap(cm, annot=True, fmt='d')  # fmt='d' shows raw counts instead of scientific notation
plt.xlabel('Predicted')
plt.ylabel('Ground Truth')
plt.show()
"""### - 5200 True Negatives were correctly predicted
### - 4700 True Positives were correctly predicted
### - A similar number of predicted values were wrongly predicted
#### -- This implies the accuracy of the model is around 50% as seen previously
# Neural Network
"""
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Binary Crossentropy')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_accuracy(history):
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = nn_model.fit(
    X_train, y_train, epochs=100, batch_size=32, validation_split=0.2
)
plot_loss(history)
plot_accuracy(history)
"""## The Neural Networks are performing well as the loss is clearly decreasing and the accuracy is increasing with each iteration.
## The accuracy however is 53% which is better than the other algorithms
# Fine Tuning model parameters
## testing on learning rate = 0.005 and 64 neurons in the 2nd layer
"""
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# learning rate increased to 0.005 for this experiment
nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.005), loss='binary_crossentropy', metrics=['accuracy'])
history = nn_model.fit(
    X_train, y_train, epochs=100, batch_size=32, validation_split=0.2
)
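"""## Evaluating the tuned network on the held-out test set (a quick check, thresholding the sigmoid output at 0.5):"""
test_loss, test_acc = nn_model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_acc:.3f}")
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_nn))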
# Save the trained network with the Keras API (Keras models should not be pickled)
nn_model.save('model.h5')
# Save the fitted scaler so the same preprocessing can be applied at inference time
pickle.dump(sc, open('scaler.pkl', 'wb'))