-
Notifications
You must be signed in to change notification settings - Fork 10
/
KMeansND.py
100 lines (86 loc) · 3.69 KB
/
KMeansND.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
def assignPointsToCentroids(centroids, points):
'''Determine the centroid to which each point is nearest, and
store this as an int from 0 to K-1 in classifications.
'''
M = points.shape[0]
K = centroids.shape[0]
classifications = np.zeros((M,), dtype=np.int)
for i in range(M):
smallestDistance = 0
for k in range(K):
distance = np.linalg.norm(points[i,:] - centroids[k,:])
if k == 0:
smallestDistance = distance
classifications[i] = k
elif distance < smallestDistance:
smallestDistance = distance
classifications[i] = k
return classifications
def recalcCentroids(centroids, points, classifications):
'''Recalculate centroid locations for each cluster.'''
K = centroids.shape[0]
N = points.shape[1]
M = points.shape[0]
newCentroids = np.zeros((K, N))
for k in range(K):
if sum(classifications == k) > 0:
newCentroids[k,:] = (
np.sum(points[classifications == k,:], axis=0)
/ sum(classifications == k))
else:
newCentroids[k,:] = centroids[k,:]
return newCentroids
class KMeansND:
'''KMeansND(initialCentroids, points)
PARAMETERS:
initialCentroids: K x N array of K initial centroids with N
features/coordinates.
points: M x N array of M points with N features/coordinates.
METHODS:
(centroids, classifications, iterations) = getCentroids()
Perform K-means clustering. Return a tuple containing the
array of centroid coordinates, an M x 1 array of point
classifications, and number of iterations required.
getGenerator()
Return a generator function to step through K-means iterations.
Each call of the generator returns the current centroids,
classifications, and iteration, beginning with the initial
centroids and classifications.
'''
def __init__(self, initialCentroids, points):
if initialCentroids.shape[1] != points.shape[1]:
raise RuntimeError('Dimension mismatch. Centroids and data points'
+ ' must be described by the same number of features.')
else:
self.initialCentroids = initialCentroids
self.points = points
def getCentroids(self):
centroids = np.copy(self.initialCentroids)
# Initialize lastCentroids to arbitrary value different from centroids
# to ensure loop executes at least once.
lastCentroids = centroids + 1
iteration = 0
while not np.array_equal(centroids, lastCentroids):
lastCentroids = np.copy(centroids)
classifications = assignPointsToCentroids(centroids, self.points)
centroids = recalcCentroids(centroids, self.points, classifications)
iteration += 1
return (centroids, classifications, iteration)
def _generatorFunc(self):
centroids = np.copy(self.initialCentroids)
lastCentroids = centroids + 1
iteration = 0
initialIteration = True
while not np.array_equal(centroids, lastCentroids):
if initialIteration:
classifications = assignPointsToCentroids(centroids, self.points)
initialIteration = False
else:
lastCentroids = np.copy(centroids)
classifications = assignPointsToCentroids(centroids, self.points)
centroids = recalcCentroids(centroids, self.points, classifications)
iteration += 1
yield (centroids, classifications, iteration)
def getGeneratorFunc(self):
return self._generatorFunc