import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import style
style.use('fivethirtyeight')
from sklearn.datasets import make_blobs  # formerly sklearn.datasets.samples_generator
import numpy as np
from scipy.stats import multivariate_normal

# Load the local dataset (note: L is read here but not used further below).
df = pd.read_csv(filepath_or_buffer='D:/SnigdhaDocs/iitm/sem2/ml/Dataset1.csv', header=None, sep=',')
df.columns = ['Col1']
df.dropna(how="all", inplace=True)
L = df.iloc[:, 0:4].values
#print(L)

# 0. Create dataset
X, Y = make_blobs(cluster_std=1.5, random_state=20, n_samples=500, centers=3)
# Stretch dataset to get ellipsoid data
X = np.dot(X, np.random.RandomState(0).randn(2, 2))
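# The 2x2 matrix multiplication above applies a random linear transform to the blobs,
# correlating the two features so each cluster becomes elliptical rather than spherical --
# exactly the kind of shape a full-covariance GMM is meant to capture.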


class GMM:
    def __init__(self, X, number_of_sources, iterations):
        self.iterations = iterations
        self.number_of_sources = number_of_sources
        self.X = X
        self.mu = None
        self.pi = None
        self.cov = None
        self.XY = None

    """Define a function which runs the EM algorithm for `iterations` iterations"""
    def run(self):
        self.reg_cov = 1e-6*np.identity(len(self.X[0]))
        x, y = np.meshgrid(np.sort(self.X[:, 0]), np.sort(self.X[:, 1]))
        self.XY = np.array([x.flatten(), y.flatten()]).T

        """ 1. Set the initial mu, covariance and pi values"""
        # This is an n x m matrix since we assume n sources (n Gaussians), each with m dimensions
        self.mu = np.random.randint(min(self.X[:, 0]), max(self.X[:, 0]), size=(self.number_of_sources, len(self.X[0])))
        # We need one m x m covariance matrix per source since we have m features
        # --> we create symmetric covariance matrices with a constant value (5) on the diagonal
        self.cov = np.zeros((self.number_of_sources, len(self.X[0]), len(self.X[0])))
        for dim in range(len(self.cov)):
            np.fill_diagonal(self.cov[dim], 5)
        self.pi = np.ones(self.number_of_sources)/self.number_of_sources  # mixing weights ("fractions")

        # In this list we store the log-likelihood per iteration and plot it at the end
        # to check whether we have converged.
        log_likelihoods = []

        """Plot the initial state"""
        fig = plt.figure(figsize=(10, 10))
        ax0 = fig.add_subplot(111)
        ax0.scatter(self.X[:, 0], self.X[:, 1])
        ax0.set_title('Initial state')
        for m, c in zip(self.mu, self.cov):
            c += self.reg_cov
            multi_normal = multivariate_normal(mean=m, cov=c)
            ax0.contour(np.sort(self.X[:, 0]), np.sort(self.X[:, 1]),
                        multi_normal.pdf(self.XY).reshape(len(self.X), len(self.X)),
                        colors='black', alpha=0.3)
            ax0.scatter(m[0], m[1], c='grey', zorder=10, s=100)
        for i in range(self.iterations):
            """E Step"""
            r_ic = np.zeros((len(self.X), len(self.cov)))
            for m, co, p, r in zip(self.mu, self.cov, self.pi, range(len(r_ic[0]))):
                co += self.reg_cov
                mn = multivariate_normal(mean=m, cov=co)
                r_ic[:, r] = p*mn.pdf(self.X)/np.sum([pi_c*multivariate_normal(mean=mu_c, cov=cov_c).pdf(self.X)
                                                      for pi_c, mu_c, cov_c in zip(self.pi, self.mu, self.cov+self.reg_cov)],
                                                     axis=0)
"""
The above calculation of r_ic is not that obvious why I want to quickly derive what we have done above.
First of all the nominator:
We calculate for each source c which is defined by m,co and p for every instance x_i, the multivariate_normal.pdf() value.
For each loop this gives us a 100x1 matrix (This value divided by the denominator is then assigned to r_ic[:,r] which is in
the end a 100x3 matrix).
Second the denominator:
What we do here is, we calculate the multivariate_normal.pdf() for every instance x_i for every source c which is defined by
pi_c, mu_c, and cov_c and write this into a list. This gives us a 3x100 matrix where we have 100 entrances per source c.
Now the formula wants us to add up the pdf() values given by the 3 sources for each x_i. Hence we sum up this list over axis=0.
This gives us then a list with 100 entries.
What we have now is FOR EACH LOOP a list with 100 entries in the nominator and a list with 100 entries in the denominator
where each element is the pdf per class c for each instance x_i (nominator) respectively the summed pdf's of classes c for each
instance x_i. Consequently we can now divide the nominator by the denominator and have as result a list with 100 elements which we
can then assign to r_ic[:,r] --> One row r per source c. In the end after we have done this for all three sources (three loops)
and run from r==0 to r==2 we get a matrix with dimensionallity 100x3 which is exactly what we want.
If we check the entries of r_ic we see that there mostly one element which is much larger than the other two. This is because
every instance x_i is much closer to one of the three gaussians (that is, much more likely to come from this gaussian) than
it is to the other two. That is practically speaing, r_ic gives us the fraction of the probability that x_i belongs to class
c over the probability that x_i belonges to any of the classes c (Probability that x_i occurs given the 3 Gaussians).
"""
"""M Step"""
# Calculate the new mean vector and new covariance matrices, based on the probable membership of the single x_i to classes c --> r_ic
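            # The loop below implements the usual M-step updates (one set per source c):
            #   m_c   = sum_i r_ic                                      (soft count)
            #   mu_c  = (1/m_c) * sum_i r_ic * x_i                      (weighted mean)
            #   cov_c = (1/m_c) * sum_i r_ic * (x_i-mu_c)(x_i-mu_c)^T   (weighted covariance)
            #   pi_c  = m_c / N                                         (mixing weight)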
            self.mu = []
            self.cov = []
            self.pi = []
            log_likelihood = []
            for c in range(len(r_ic[0])):
                m_c = np.sum(r_ic[:, c], axis=0)
                mu_c = (1/m_c)*np.sum(self.X*r_ic[:, c].reshape(len(self.X), 1), axis=0)
                self.mu.append(mu_c)
                # Calculate the covariance matrix per source based on the new mean
                self.cov.append(((1/m_c)*np.dot((np.array(r_ic[:, c]).reshape(len(self.X), 1)*(self.X-mu_c)).T,
                                                (self.X-mu_c))) + self.reg_cov)
                # Calculate pi_new, i.e. the "fraction of points" (the fraction of the total probability mass)
                # assigned to each source. Here np.sum(r_ic) equals the number of instances: each row of r_ic
                # sums to 1, so summing every element sums those ones over all rows, which gives the number of
                # rows in X. Since pi_new contains the fractions of data points assigned to the sources c,
                # its elements must add up to 1.
                self.pi.append(m_c/np.sum(r_ic))
"""Log likelihood"""
log_likelihoods.append(np.log(np.sum([k*multivariate_normal(self.mu[i],self.cov[j]).pdf(X) for k,i,j in zip(self.pi,range(len(self.mu)),range(len(self.cov)))])))
"""
This process of E step followed by a M step is now iterated a number of n times. In the second step for instance,
we use the calculated pi_new, mu_new and cov_new to calculate the new r_ic which are then used in the second M step
to calculat the mu_new2 and cov_new2 and so on....
"""

        fig2 = plt.figure(figsize=(10, 10))
        ax1 = fig2.add_subplot(111)
        ax1.set_title('Log-Likelihood')
        ax1.plot(range(0, self.iterations, 1), log_likelihoods)
        #plt.show()
"""Predict the membership of an unseen, new datapoint"""
def predict(self,Y):
# PLot the point onto the fittet gaussians
fig3 = plt.figure(figsize=(10,10))
ax2 = fig3.add_subplot(111)
ax2.scatter(self.X[:,0],self.X[:,1])
for m,c in zip(self.mu,self.cov):
multi_normal = multivariate_normal(mean=m,cov=c)
ax2.contour(np.sort(self.X[:,0]),np.sort(self.X[:,1]),multi_normal.pdf(self.XY).reshape(len(self.X),len(self.X)),colors='black',alpha=0.3)
ax2.scatter(m[0],m[1],c='grey',zorder=10,s=100)
ax2.set_title('Final state')
for y in Y:
ax2.scatter(y[0],y[1],c='orange',zorder=10,s=100)
prediction = []
for m,c in zip(self.mu,self.cov):
#print(c)
prediction.append(multivariate_normal(mean=m,cov=c).pdf(Y)/np.sum([multivariate_normal(mean=mean,cov=cov).pdf(Y) for mean,cov in zip(self.mu,self.cov)]))
#plt.show()
return prediction


gmm = GMM(X, 3, 50)
gmm.run()
print(gmm.predict([[0.5, 0.5]]))
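
# Optional cross-check (not part of the original script): scikit-learn's GaussianMixture fits a
# 3-component full-covariance GMM to the same data. Its predict_proba returns the posterior
# membership of the query point; the numbers will not match the hand-rolled model exactly
# (different initialisation, and the prediction above ignores pi), but the dominant component
# should usually agree.
from sklearn.mixture import GaussianMixture

sk_gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=20).fit(X)
print(sk_gmm.predict_proba(np.array([[0.5, 0.5]])))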