-
Notifications
You must be signed in to change notification settings - Fork 0
/
K-modes+Clustering cversion.py
121 lines (79 loc) · 3.48 KB
/
K-modes+Clustering cversion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
# coding: utf-8
# In[27]:
import pandas as pd
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer
# prepare dataset filling null values with 0
df = pd.read_excel("Deloitte Team 1 Debtor segmentation.xlsx")
df = df.fillna(0)
df_copy=df.copy()
df5 = pd.DataFrame(df, columns= ['Amount Due', 'Active Bucket','30<60','61<90','91<120','121+'])
model = KModes()
visualizer = KElbowVisualizer(model, k=(1,7))
visualizer.fit(df5) # Fit the data to the visualizer
visualizer.show()
# In[28]:
# binning by manually dividing the scale of amount into buckets and attaching labels to each bucket
df['Amount Due bin'] = pd.cut(df['Amount Due'], [-1,0,1000,3000,5000,10000,50000,1000000],
labels = ['0','1-1000', '1000-3000','3000-5000','5000-10000',
'10000-50000','50000-1000000'])
df = df.drop('Amount Due', axis=1)
df['Active Bucket bin'] = pd.cut(df['Active Bucket'], [-1,0,1000,3000,5000,10000,20000],
labels = ['0','1-1000', '1000-3000','3000-5000',
'5000-10000','10000-20000'])
df = df.drop('Active Bucket', axis=1)
df['30<60 bin'] = pd.cut(df['30<60'], [-1,0,1000,3000,5000,10000,20000],
labels = ['0','1-1000', '1000-3000','3000-5000',
'5000-10000','15000-20000'])
df = df.drop('30<60', axis=1)
df['61<90 bin'] = pd.cut(df['61<90'], [-1,0,1000,3000,5000,10000,40000],
labels = ['0','1-1000', '1000-3000','3000-5000',
'5000-10000','15000-40000'])
df = df.drop('61<90', axis=1)
df['91<120 bin'] = pd.cut(df['91<120'], [-1,0,1000,3000,5000,10000,100000],
labels = ['0','1-1000', '1000-3000','3000-5000',
'5000-10000','10000-100000'])
df = df.drop('91<120', axis=1)
df['121+ bin'] = pd.cut(df['121+'], [-1,0,1000,3000,5000,10000,200000],
labels = ['0','1-1000', '1000-3000','3000-5000',
'5000-10000','10000-200000'])
df = df.drop('121+', axis=1)
# In[29]:
# create a sub-dataset containing only columns related to amount due
# which are more relevant to the risk level clustering
df1 = pd.DataFrame(df, columns= ['Amount Due bin', 'Active Bucket bin','30<60 bin','61<90 bin','91<120 bin','121+ bin'])
# fit the KModes model with k=4
kmodes = KModes(n_clusters = 3)
clusters = kmodes.fit_predict(df1)
# attach the cluster prediction to original dataset
clusters = pd.DataFrame(clusters)
clusters.columns = ['cluster_pred']
combined_df = pd.concat([df, clusters], axis=1).reset_index()
combined_df = combined_df.drop(['index'],axis=1)
# Print the cluster centroids
print(kmodes.cluster_centroids_)
# print cluster labels
print(clusters)
# print out the number of datapoints in each cluster
print(combined_df['cluster_pred'].value_counts())
# In[30]:
combined_df.to_excel(r'C:\Users\chenh\Downloads\newoutputresult.xlsx')
# In[31]:
fig = plt.figure(figsize = (40,20))
ax1 = fig.add_subplot(111)
ax1.set_title('risk analysis')
plt.xlabel('Amount Due')
plt.ylabel('risk level cluster')
x = df_copy['Amount Due']
y = combined_df['cluster_pred']
plt.xticks([0,5000,10000,20000,30000,50000,350000])
plt.yticks([0,1,2,3])
plt.text(336677,3,'Emirates Airlines')
plt.text(58543,3,'Air NZ')
plt.text(63938,1,'Singapore Airlines')
plt.scatter(x,y)
plt.savefig("scatter.png")
plt.show()
# In[ ]: