# new_scores.py
import numpy as np
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
import time
from datasets import create_data4
from distance import max_edge_in_guided_path, centre_from_data
from labelling import diagonal_line, assign_labels_by_given_line, vertical_line, horizontal_line
def new_silhouette_score(X, labels, k=5, lookahead=3, debug=False):
    """
    Calculate a centre-based Silhouette score for a given set of labels.

    Every sample-to-cluster-centre distance is obtained from
    ``max_edge_in_guided_path`` and cluster centres from ``centre_from_data``.
    For each sample, the intra-cluster term is the mean distance of its
    cluster's members to their own centre (one shared value per cluster) and
    the inter-cluster term is the smallest distance to the centre of any
    other cluster.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: array-like, shape (n_samples,)
        Cluster labels for each data point.
    - k, lookahead, debug:
        Forwarded unchanged to ``max_edge_in_guided_path``.

    Returns:
    - mean_silhouette_score: float
        Mean silhouette coefficient over all samples, or 0.0 when fewer than
        two clusters are present.
    """
    X = np.asarray(X)
    # Coerce so that list/tuple labels support the boolean masks used below.
    labels = np.asarray(labels)
    n_samples, _ = X.shape

    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    if n_clusters < 2:
        # No "other" cluster exists, so the coefficient is defined as 0.
        return 0.0

    cluster_means = np.array([centre_from_data(X[labels == label]) for label in unique_labels])

    # distance_matrix[s, c] = guided-path distance from sample s to centre c.
    distance_matrix = np.zeros((n_samples, n_clusters))
    for sid, sample in enumerate(X):
        for cid, centre in enumerate(cluster_means):
            distance_matrix[sid, cid] = max_edge_in_guided_path(X, sample, centre, k, lookahead, debug)

    # own_cluster[s, c] is True when sample s carries the label of cluster c.
    own_cluster = labels[:, np.newaxis] == unique_labels

    # Intra-cluster distance: one value per cluster, shared by all its members.
    intra_cluster_distances = np.zeros(n_samples)
    for cid in range(n_clusters):
        members = own_cluster[:, cid]
        intra_cluster_distances[members] = np.mean(distance_matrix[members, cid])

    # Inter-cluster distance: nearest foreign centre (own centre masked out).
    inter_cluster_distances = np.min(np.where(own_cluster, np.inf, distance_matrix), axis=1)

    # Silhouette coefficient per sample; a zero denominator (both distances
    # are zero) yields 0 instead of NaN.
    scale = np.maximum(inter_cluster_distances, intra_cluster_distances)
    silhouette_coefficients = np.divide(
        inter_cluster_distances - intra_cluster_distances,
        scale,
        out=np.zeros(n_samples),
        where=scale != 0,
    )

    return np.mean(silhouette_coefficients)
def new_davies_bouldin_score(X, labels, k=5, lookahead=3, debug=False):
    """
    Calculate a Davies-Bouldin-style score for a given set of labels.

    Both the centre-to-centre separation and the per-cluster scatter are
    measured with ``max_edge_in_guided_path`` (instead of the Euclidean norm);
    cluster centres come from ``centre_from_data``.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: array-like, shape (n_samples,)
        Cluster labels for each data point. Any label values are accepted;
        they do not have to be the integers 0..n_clusters-1.
    - k, lookahead, debug:
        Forwarded unchanged to ``max_edge_in_guided_path``.

    Returns:
    - db_index: float
        The Davies-Bouldin score, or 0.0 when fewer than two clusters are
        present (same degenerate-case convention as ``new_silhouette_score``).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    if n_clusters < 2:
        # There is no pair of clusters to compare.
        return 0.0

    # Select members by actual label value, not by positional index, so that
    # scatter and centres always refer to the same cluster.
    members = [X[labels == label] for label in unique_labels]

    # Calculate cluster means
    cluster_means = np.array([centre_from_data(points) for points in members])

    # Calculate cluster distances (both directions are evaluated separately;
    # no symmetry of the path distance is assumed).
    cluster_distances = np.zeros((n_clusters, n_clusters))
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                cluster_distances[i, j] = max_edge_in_guided_path(
                    X, cluster_means[i], cluster_means[j], k, lookahead, debug
                )

    # Calculate cluster-wise scatter: mean distance of members to their centre
    cluster_scatter = np.array([
        np.mean([
            max_edge_in_guided_path(X, sample, centre, k, lookahead, debug)
            for sample in points
        ])
        for points, centre in zip(members, cluster_means)
    ])

    # Calculate Davies-Bouldin index: average over clusters of the worst
    # (largest) scatter-to-separation ratio against any other cluster.
    db_index = 0.0
    for i in range(n_clusters):
        max_similarity = -np.inf
        for j in range(n_clusters):
            if i != j:
                similarity = (cluster_scatter[i] + cluster_scatter[j]) / cluster_distances[i, j]
                if similarity > max_similarity:
                    max_similarity = similarity
        db_index += max_similarity

    db_index /= n_clusters

    return db_index
def new_calinski_harabasz_score(X, labels, k=5, lookahead=3, debug=False):
    """
    Calculate a Calinski-Harabasz-style score for a given set of labels.

    The between-cluster and within-cluster dispersion terms are built from
    ``max_edge_in_guided_path`` distances (not squared Euclidean distances);
    cluster centres and the overall centre come from ``centre_from_data``.

    Parameters:
    - X: ndarray, shape (n_samples, n_features)
        The input data.
    - labels: array-like, shape (n_samples,)
        Cluster labels for each data point. Any label values are accepted;
        they do not have to be the integers 0..n_clusters-1.
    - k, lookahead, debug:
        Forwarded unchanged to ``max_edge_in_guided_path``.

    Returns:
    - ch_index: float
        The Calinski-Harabasz score, or 0.0 when fewer than two clusters are
        present (same degenerate-case convention as ``new_silhouette_score``).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    n_samples, _ = X.shape

    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    if n_clusters < 2:
        # Avoids dividing by (n_clusters - 1) == 0 below.
        return 0.0

    # Select members by actual label value, not by positional index, so that
    # sizes, samples and centres always refer to the same cluster.
    members = [X[labels == label] for label in unique_labels]

    # Calculate cluster means
    cluster_means = np.array([centre_from_data(points) for points in members])

    # Calculate overall mean
    overall_mean = centre_from_data(X)

    # Between-cluster dispersion: centre-to-overall-centre distance weighted
    # by cluster size.
    between_cluster_ss = np.sum([
        max_edge_in_guided_path(X, centre, overall_mean, k, lookahead, debug) * len(points)
        for points, centre in zip(members, cluster_means)
    ])

    # Within-cluster dispersion: every sample's distance to its own centre.
    within_cluster_ss = np.sum([
        max_edge_in_guided_path(X, sample, centre, k, lookahead, debug)
        for points, centre in zip(members, cluster_means)
        for sample in points
    ])

    # Calculate Calinski-Harabasz index
    ch_index = (between_cluster_ss / (n_clusters - 1)) / (within_cluster_ss / (n_samples - n_clusters))

    return ch_index
if __name__ == '__main__':
    # Demo: time each sklearn score next to its guided-path counterpart on the
    # same data and labelling, printing the value and the elapsed seconds.
    n_samples = 1000
    X, y = create_data4(n_samples)
    X = MinMaxScaler((-1, 1)).fit_transform(X)
    X, y = shuffle(X, y, random_state=7)

    # Three candidate labellings; only `vl` is scored below, `dp` and `hl`
    # are computed but not used further.
    dp = assign_labels_by_given_line(X, diagonal_line(X))
    vl = assign_labels_by_given_line(X, vertical_line(0))
    hl = assign_labels_by_given_line(X, horizontal_line(0))

    k, la = 5, 20
    y = np.copy(vl)

    # (printed tag, scoring function, extra keyword arguments). Only the
    # Davies-Bouldin variant receives k / la; the other two new scores run
    # with their own defaults.
    benchmarks = (
        ("SS", silhouette_score, {}),
        ("NSS", new_silhouette_score, {"debug": True}),
        ("DBS", davies_bouldin_score, {}),
        ("NDBS", new_davies_bouldin_score, {"k": k, "lookahead": la, "debug": True}),
        ("CHS", calinski_harabasz_score, {}),
        ("NCHS", new_calinski_harabasz_score, {"debug": True}),
    )

    for position, (tag, scorer, options) in enumerate(benchmarks):
        began = time.time()
        result = scorer(X, y, **options)
        print(f"{tag}: {result} in {time.time() - began:.2f}s")
        if position % 2 == 1:
            # Blank line after each reference/new pair.
            print()