# np_scores.py
import numpy as np
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
# Local dataset generators (the project's own `datasets` module, not sklearn.datasets)
from datasets import (create_data1, create_data2, create_data3, create_data4,
                      create_data5, create_data6, create_data7)


def np_calinski_harabasz_score(X, labels):
    """
    Calculate the Calinski-Harabasz index for a given clustering.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: ndarray, shape (n_samples,)
        Cluster labels for each data point.

    Returns:
    - ch_index: float
        The Calinski-Harabasz index.
    """
    n_samples, n_features = X.shape
    k = np.max(labels) + 1  # Number of clusters (assumes labels are 0..k-1)
    # Calculate cluster means
    cluster_means = np.array([np.mean(X[labels == i], axis=0) for i in range(k)])
    # Calculate overall mean
    overall_mean = np.mean(X, axis=0)
    # Calculate between-cluster sum of squares
    between_cluster_ss = np.sum([np.sum((cluster_means[i] - overall_mean) ** 2) * np.sum(labels == i)
                                 for i in range(k)])
    # Calculate within-cluster sum of squares
    within_cluster_ss = np.sum([np.sum((X[labels == i] - cluster_means[i]) ** 2) for i in range(k)])
    # Calculate Calinski-Harabasz index
    ch_index = (between_cluster_ss / (k - 1)) / (within_cluster_ss / (n_samples - k))
    return ch_index
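
# For reference, the computation above follows the standard definition
#     CH = (B / (k - 1)) / (W / (n - k))
# where B is the between-cluster sum of squares, W the within-cluster sum of
# squares, n the number of samples, and k the number of clusters; larger
# values indicate denser, better-separated clusters.

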
def np_davies_bouldin_score(X, labels):
    """
    Calculate the Davies-Bouldin index for a given clustering.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: ndarray, shape (n_samples,)
        Cluster labels for each data point.

    Returns:
    - db_index: float
        The Davies-Bouldin index.
    """
    n_samples, n_features = X.shape
    k = np.max(labels) + 1  # Number of clusters (assumes labels are 0..k-1)
    # Calculate cluster means
    cluster_means = np.array([np.mean(X[labels == i], axis=0) for i in range(k)])
    # Calculate pairwise distances between cluster means
    cluster_distances = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            if i != j:
                cluster_distances[i, j] = np.linalg.norm(cluster_means[i] - cluster_means[j])
    # Calculate cluster-wise scatter (mean distance of each cluster's points to its mean)
    cluster_scatter = np.array([np.mean([np.linalg.norm(X[m] - cluster_means[i])
                                         for m in range(n_samples) if labels[m] == i])
                                for i in range(k)])
    # Calculate Davies-Bouldin index: for each cluster, take the worst-case
    # similarity to any other cluster, then average over clusters
    db_index = 0.0
    for i in range(k):
        max_similarity = -np.inf
        for j in range(k):
            if i != j:
                similarity = (cluster_scatter[i] + cluster_scatter[j]) / cluster_distances[i, j]
                if similarity > max_similarity:
                    max_similarity = similarity
        db_index += max_similarity
    db_index /= k
    return db_index
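
# For reference, the loops above implement the standard definition
#     DB = (1 / k) * sum_i max_{j != i} (s_i + s_j) / d(c_i, c_j)
# where s_i is the mean distance of cluster i's points to its centroid c_i and
# d(c_i, c_j) is the distance between centroids; lower values are better.

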
def np_silhouette_score(X, labels):
    """
    Calculate the silhouette score for a given clustering.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: ndarray, shape (n_samples,)
        Cluster labels for each data point.

    Returns:
    - silhouette_avg: float
        The silhouette score.
    """
    n_samples = X.shape[0]
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    if n_clusters == 1:
        return 0.0
    # Compute the silhouette value for each sample, then average
    silhouette_values = np.zeros(n_samples)
    for i in range(n_samples):
        cluster_label = labels[i]
        cluster_points = X[labels == cluster_label]
        if len(cluster_points) == 1:
            # Convention (matching sklearn): a singleton cluster scores 0
            silhouette_values[i] = 0.0
            continue
        # Mean intra-cluster distance (a_i), excluding the sample itself
        a_i = np.sum(np.linalg.norm(cluster_points - X[i], axis=1)) / (len(cluster_points) - 1)
        # Mean nearest-cluster distance (b_i)
        b_i = np.inf
        for other_label in unique_labels:
            if other_label != cluster_label:
                other_cluster_points = X[labels == other_label]
                dist = np.mean(np.linalg.norm(other_cluster_points - X[i], axis=1))
                b_i = min(b_i, dist)
        # Compute silhouette value for sample i
        silhouette_values[i] = (b_i - a_i) / max(a_i, b_i)
    silhouette_avg = np.mean(silhouette_values)
    return silhouette_avg
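
# For reference, each per-sample value above is
#     s(i) = (b_i - a_i) / max(a_i, b_i)
# with a_i the mean distance to the other points in i's own cluster and b_i
# the smallest mean distance to the points of any other cluster; s(i) lies in
# [-1, 1], with values near 1 indicating well-separated samples.

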
def np_silhouette_score2(X, labels):
    """
    Centroid-based ("simplified") silhouette score: distances to cluster means
    stand in for mean pairwise distances, so this runs in O(n * k) rather than
    O(n^2) but only approximates the exact silhouette score.
    """
    n_samples = len(X)
    unique_labels = np.unique(labels)
    num_clusters = len(unique_labels)
    if num_clusters == 1:
        return 0.0
    cluster_means = np.array([np.mean(X[labels == i], axis=0) for i in unique_labels])
    # Compute the distance matrix between samples and cluster means, shape (n_samples, k)
    distance_matrix = np.linalg.norm(X[:, np.newaxis] - cluster_means, axis=2)
    # Intra-cluster distance: each sample's distance to its own cluster mean
    intra_cluster_distances = np.zeros(n_samples)
    for i in range(num_clusters):
        mask = labels == unique_labels[i]
        intra_cluster_distances[mask] = distance_matrix[mask, i]
    # Inter-cluster distance: each sample's distance to the nearest other
    # cluster mean (the own-cluster column is masked out with +inf)
    inter_cluster_distances = np.min(distance_matrix + np.where(labels[:, np.newaxis] == unique_labels, np.inf, 0), axis=1)
    # Compute the silhouette coefficient for each sample
    silhouette_coefficients = (inter_cluster_distances - intra_cluster_distances) / np.maximum(inter_cluster_distances, intra_cluster_distances)
    # Compute the mean silhouette score over all samples
    mean_silhouette_score = np.mean(silhouette_coefficients)
    return mean_silhouette_score
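
# A minimal sanity-check helper (a sketch, not part of the original script):
# it prints the exact and the centroid-based silhouette side by side. Since
# np_silhouette_score2 is only an approximation, small discrepancies between
# the two numbers are expected.
def _compare_silhouettes(X, labels):
    exact = np_silhouette_score(X, labels)        # O(n^2) pairwise distances
    simplified = np_silhouette_score2(X, labels)  # O(n * k) centroid distances
    print(f"exact silhouette:      {exact:.4f}")
    print(f"simplified silhouette: {simplified:.4f}")
    return exact, simplified

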
if __name__ == '__main__':
    # X, y = create_data_circles()
    # X, y = create_data_moons()
    # X, y = create_data_blobs()
    # X, y = create_data_elongated_close_blobs()
    # X, y = create_data_diff_density_blobs()
    # X, y = create_data_nostructure()
    X, y = create_data6()
    # X, y = create_data2()
    print(calinski_harabasz_score(X, y))
    print(np_calinski_harabasz_score(X, y))
    print()
    print(davies_bouldin_score(X, y))
    print(np_davies_bouldin_score(X, y))
    print()
    print(silhouette_score(X, y))
    print(np_silhouette_score(X, y))
    print(np_silhouette_score2(X, y))
    print()