"""
Code to perform hierarchical spectral or k-means clustering on given data.
The data is transformed with PCA before clustering.
The code consists of an inner loop that performs the clustering for a single
leaf of the current tree, and an outer loop that decides whether to accept that
clustering and whether to go to the next level in the hierarchy.
The user can choose which scores to use for both the inner and outer loop from
Wemmert-Gancarski, CalinskiHarabasz, Silhouette or DaviesBouldin.
If a cluster is too sparsely populated it will be added to an outlier node.
Authors:
* Written by Julia Crook, Maryna Lukach
October 2024
"""
import pdb
import warnings
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from sklearn import cluster
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestCentroid
from sklearn.decomposition import PCA
from cluster_tree import *
from scores import *
#---------------------------------------------------------------------------------
# get_PCA()
# From the standardised data determine the number of PCA components to keep and
# the representation of X_std in PCA space.
# inputs:
# X_std[nsamples,nvars] - the standardised (quantile-transformed) input data
# evr_threshold - explained variance ratio threshold - we use this to determine how many components to keep
# verbose - if True, print what is going on
# returns:
# selected_pca - the number of PCA components to use
# X_new[nsamples,nvars] - the PCA-transformed data
#---------------------------------------------------------------------------------
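# worked example (hypothetical numbers): if pca.explained_variance_ratio_ is
# [0.55, 0.25, 0.12, 0.05, 0.03], the components explaining >= 10% are the first
# three (cumulative 0.92); with evr_threshold=0.90 they suffice so selected_pca=3,
# with evr_threshold=0.95 one extra component is added and selected_pca=4.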
def get_PCA(X_std, evr_threshold, verbose=False):
    pca = PCA()
    if verbose:
        print('get_PCA: input data shape is {}, evr_threshold is {}'.format(X_std.shape, evr_threshold))
    X_new = pca.fit_transform(X_std)
    if verbose:
        print('pca.explained_variance_ratio_', pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_))
        print('pca.components_', pca.components_)
    # keep every component that explains at least 10% of the variance; if together
    # they fall short of evr_threshold, keep one more component
    last_major = np.max(np.where(pca.explained_variance_ratio_ >= 0.10))
    if np.sum(pca.explained_variance_ratio_[0:last_major + 1]) >= evr_threshold:
        selected_pca = last_major + 1
    else:
        selected_pca = last_major + 2
    if verbose:
        print('selected_pca', selected_pca)
    return selected_pca, X_new
#---------------------------------------------------------------------------------
# is_cluster_too_small
# Determines clusters that have fewer than a specified minimum number of points
# and returns the indices of the data points belonging to these small clusters.
# inputs:
# labels (array-like): Cluster labels for the dataset.
# min_cluster_size (int): The minimum number of points that a cluster should have.
# verbose - if True, print what is going on
# returns:
# small_clusters_indices : array of all indices belonging to these small clusters.
#---------------------------------------------------------------------------------
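# example (hypothetical input): labels=[0,0,0,1,2,2] with min_cluster_size=2
# returns array([3]) - the index of the single point in cluster 1.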
def is_cluster_too_small(labels, min_cluster_size, verbose=False):
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    small_clusters_indices = []
    for i in range(len(unique_labels)):
        if verbose:
            print('cluster_label = {} label_counts[cluster_label] = {}'.format(unique_labels[i], label_counts[i]))
        if label_counts[i] < min_cluster_size:
            # Get indices of data points belonging to the small cluster
            indices = np.where(labels == unique_labels[i])
            small_clusters_indices.append(indices[0])
    if len(small_clusters_indices) > 0:
        small_clusters_indices = np.concatenate(small_clusters_indices)
    return small_clusters_indices
#---------------------------------------------------------------------------------
# remove_label_gaps
# Replaces the values of labels in the input list so that there are no more gaps in the list of unique labels,
# while keeping the order of label values.
# inputs:
# labels (list): List of labels, which may have gaps in the unique values.
#
# Returns:
# new_labels: A new numpy array of labels with the gaps removed.
#---------------------------------------------------------------------------------
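# example: remove_label_gaps([0, 2, 2, 5]) returns array([0, 1, 1, 2])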
def remove_label_gaps(labels):
    # Get the sorted unique labels from the input list
    unique_labels = np.unique(labels)
    label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels)}
    # Apply the mapping to the input list of labels
    new_labels = np.asarray([label_mapping[label] for label in labels])
    return new_labels
#---------------------------------------------------------------------------------
# get_replacement_current_cluster_labels
# replaces all values of 0 in model_labels with old_label (the label this cluster had) and maps each other
# non-zero label n to next_cluster_label + n - 1. OUTLIER_LABEL values are left as they are.
# inputs:
# old_label - the label of the cluster that has been subclustered
# model_labels - the labels that the clustering algorithm came up with. Usually 0-n, but may contain OUTLIER_LABEL
# next_cluster_label - this is the next available number to use for new labels
# Returns:
# new_labels: A new numpy array of labels (same size as model_labels) with the 0s replaced by old_label
# and the other positive labels replaced by next_cluster_label upwards.
#---------------------------------------------------------------------------------
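# example (assuming OUTLIER_LABEL is negative, as the ==0/>0 tests require):
# old_label=3, model_labels=[0,1,2,0,OUTLIER_LABEL], next_cluster_label=5
# gives [3,5,6,3,OUTLIER_LABEL] - 0s take the parent's label, 1 and 2 become 5 and 6.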
def get_replacement_current_cluster_labels(old_label, model_labels, next_cluster_label):
    # where the model_labels is 0 set to the old label
    # for all the other positive model_labels set to next_cluster_label upwards
    new_labels = np.copy(model_labels)
    ix_zero = np.where(model_labels == 0)
    ix_non_zero = np.where(model_labels > 0)
    new_labels[ix_zero[0]] = old_label
    new_labels[ix_non_zero[0]] = model_labels[ix_non_zero] + next_cluster_label - 1
    return new_labels
#---------------------------------------------------------------------------------
# get_subclusters
# The inner loop of the clustering that finds the optimum number of splits for this_cluster_data.
# Starts with nclusters=2, uses sklearn spectral clustering or k-means to do the split, checks if there are outliers,
# and calculates a new score (not including the outliers) to determine if this split is better than the previous split.
# If it is better, we go round the loop again with nclusters incremented.
#
# inputs:
# this_cluster_data - the data[nsamples,nvars] for the current cluster
# evr_threshold - explained variance ratio threshold - we use this to determine how many variables to use in the PCA calculation
# min_npoints_to_cluster - if a cluster holds fewer than this number of points we won't try to subcluster
# max_npoints_for_outlier - if a subcluster holds fewer points than this it is treated as an outlier
# scorer_type - the type of scorer to use for the inner loop to determine when to stop
# cluster_name - full label of the cluster we are working on (just for printing purposes)
# use_kmeans - if True, use kmeans instead of spectral clustering
# verbose - if True, print what is going on
# returns:
# prev_labels - the labels defining the new sub clusters of this cluster
# X_new - the pca data for this cluster
#---------------------------------------------------------------------------------
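# note on the stopping rule (an assumption about scores.py semantics): with e.g.
# a CalinskiHarabasz scorer, scorer.split_is_better(this_score) is presumed to
# compare this_score with the last accepted score, so the loop keeps incrementing
# nclusters until the score stops improving and then returns the previous labels.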
def get_subclusters(this_cluster_data, evr_threshold, min_npoints_to_cluster, max_npoints_for_outlier, scorer_type, cluster_name, use_kmeans=False, verbose=False):
    # Initialize variables
    number_of_clusters = 2
    scorer = create_scorer(scorer_type)
    split_is_better = True
    # standardise the data and convert to PCA space
    X_std = QuantileTransformer(output_distribution='normal').fit_transform(this_cluster_data)
    selected_pca, X_new = get_PCA(X_std, evr_threshold, verbose=verbose)
    active_indices = np.arange(this_cluster_data.shape[0])  # initially all indices for this data
    nactive = this_cluster_data.shape[0]
    new_labels = np.zeros(this_cluster_data.shape[0], int)
    prev_labels = []
    clust_type = 'kmeans' if use_kmeans else 'spectral'
    while split_is_better:
        if verbose:
            print('get_subclusters: number_of_clusters {}'.format(number_of_clusters))
        X_new_filtered = X_new[active_indices, 0:selected_pca]
        if verbose:
            print('get_subclusters: doing {} clustering of {} with X_new_filtered.shape = {}'.format(clust_type, cluster_name, X_new_filtered.shape))
        # Apply k-means or spectral clustering
        if use_kmeans:
            model = cluster.KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=0, n_init=10)
            model.fit(X_new_filtered)
        else:
            model = cluster.SpectralClustering(n_clusters=number_of_clusters, assign_labels='discretize')
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore',
                                        message='the number of connected components of the connectivity matrix is [0-9]{1,2}' +
                                        ' > 1. Completing it to avoid stopping the tree early.',
                                        category=UserWarning)
                warnings.filterwarnings('ignore',
                                        message='Graph is not fully connected, spectral embedding may not work as expected.',
                                        category=UserWarning)
                model.fit(X_new_filtered)
        labels = model.labels_
        if verbose:
            print('get_subclusters: finding small clusters')
        # Check and remove small clusters
        small_clusters_indices = is_cluster_too_small(labels, max_npoints_for_outlier, verbose=verbose)
        noutliers = len(small_clusters_indices)
        if noutliers > 0:
            # update the labels to indicate outlier
            labels[small_clusters_indices] = OUTLIER_LABEL
            # recalculate active indices and make sure the non-outlier labels are contiguous from 0
            this_active_indices = np.where(labels != OUTLIER_LABEL)
            this_labels = remove_label_gaps(labels[this_active_indices])
            labels[this_active_indices] = this_labels
            noutliers = 1  # from here on noutliers is just a flag: outliers were found
        new_labels[active_indices] = labels
        # recalculate active_indices to exclude outliers for the score calculation
        active_indices = np.where(new_labels != OUTLIER_LABEL)[0]
        nactive = len(active_indices)
        if nactive < min_npoints_to_cluster:
            # we cannot continue to split
            split_is_better = False
            if verbose:
                print('get_subclusters: remaining active points too few to cluster - abandoning')
            continue
        if number_of_clusters - noutliers == 1:
            # only 1 cluster left if we remove the outliers
            if verbose:
                print('get_subclusters: only 1 cluster left after removing outliers')
            continue  # go to the next iteration of the while loop with the new active_indices to try again without the outliers
        X_new_filtered = X_new[active_indices, 0:selected_pca]
        this_score = scorer.compute_score(X_new_filtered, new_labels[active_indices])
        if verbose:
            print('get_subclusters: calculated {} score as'.format(scorer.name), this_score)
        if number_of_clusters > 2:
            split_is_better = scorer.split_is_better(this_score)
        if split_is_better:
            # we are going round the loop again so save these labels and score
            # if we found outliers then don't change the number of clusters but do the clustering again without the outliers
            # otherwise increment the number of clusters
            if noutliers == 0:
                number_of_clusters += 1
            scorer.add_score(this_score)  # save the scores
            prev_labels = np.copy(new_labels)  # save labels from this loop
            if verbose:
                print('get_subclusters: accepting split and continuing')
        elif verbose:
            print('get_subclusters: rejecting split and stopping')
    # we stopped because we found a worse split so use prev_labels
    # recalculate active_indices to exclude outliers
    active_indices = np.where(prev_labels != OUTLIER_LABEL)[0]
    nactive = len(active_indices)
    if nactive == 0:
        print('get_subclusters: no active cluster any more!!!!!')
        pdb.set_trace()
    print('get_subclusters: loop complete')
    return prev_labels, X_new
#---------------------------------------------------------------------------------
# perform_hierarchical_clustering
# The outer loop of the clustering
# inputs:
# cluster_data[nsamples,nvars] - all the data that will be clustered
# field_list - list of the names of the radar variables in the cluster_data (this is stored in root of the tree)
# evr_threshold - explained variance ratio threshold - we use this to determine how many variables to use in the PCA calculation
# min_pc_points_to_cluster - if a cluster holds fewer than this % x number of samples we won't try to subcluster
# max_pc_points_for_outlier - this % x number of samples gives the cluster size below which a subcluster is treated as an outlier
# scorer_outer - the type of scorer for the outer loop
# scorer_inner - the type of scorer for the inner loop
# output_dir - where to store the output
# plot_fname_prefix - the prefix for the output filename
# use_kmeans - if True use kmeans otherwise use spectral clustering
# verbose - if True, print what is going on
# returns:
# root - the root ClusterNode of the full cluster tree
#---------------------------------------------------------------------------------
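# example of the percentage-to-count conversion below: with nsamples=10000,
# min_pc_points_to_cluster=5 (%) gives min_npoints_to_cluster=500 and
# max_pc_points_for_outlier=1 (%) gives max_npoints_for_outlier=100.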
def perform_hierarchical_clustering(cluster_data, field_list, evr_threshold, min_pc_points_to_cluster, max_pc_points_for_outlier, scorer_outer, scorer_inner, output_dir, plot_fname_prefix, use_kmeans=False, verbose=False):
    nsamples = cluster_data.shape[0]
    min_npoints_to_cluster = int(min_pc_points_to_cluster * nsamples / 100)
    max_npoints_for_outlier = int(max_pc_points_for_outlier * nsamples / 100)
    if verbose:
        print(nsamples, 'samples, min points to cluster=', min_npoints_to_cluster, 'max points for outlier=', max_npoints_for_outlier)
    all_labels = np.zeros(nsamples, int)
    possible_labels = np.copy(all_labels)
    # whole dataset is the root cluster
    # create root node of the cluster-tree
    root = ClusterNode('root', None, np.arange(nsamples), data=cluster_data, variables=field_list)
    outlier_node = None
    n_level = 0
    best_split = False
    scorer = create_scorer(scorer_outer)
    # best_split is the result of the scorer calculation
    while not best_split:
        n_splits = 0
        if verbose:
            print('perform_hierarchical_clustering: we are starting the new loop for level', n_level)
        # select all current active clusters
        active_clusters = []
        root.leaves(active_clusters, ignore_outlier=True)
        nactive_clusters = len(active_clusters)
        next_cluster_label = nactive_clusters
        # note: 'node' is used for the current ClusterNode so we don't shadow sklearn's cluster module
        cluster_sizes = np.asarray([len(node.indxs) for node in active_clusters])
        sorted_ix = np.flip(np.argsort(cluster_sizes))  # sort in descending order
        if verbose:
            print(nactive_clusters, 'active clusters: sorted clusters is', sorted_ix, 'and the sizes are', cluster_sizes[sorted_ix])
        for k in sorted_ix:
            node = active_clusters[k]
            if hasattr(node, 'complete'):
                # we tried this before so skip
                continue
            current_cluster_data = cluster_data[node.indxs]
            clust_name = node.get_full_name()  # get the full name so we know where we are in the hierarchy
            if verbose:
                print('perform_hierarchical_clustering: we work with cluster {} name {} of size {}, next cluster_label= {}'.format(k, clust_name, cluster_sizes[k], next_cluster_label))
            # threshold on the minimum number of points to cluster
            if current_cluster_data.shape[0] > min_npoints_to_cluster:
                best_labels, current_cluster_pca_data = get_subclusters(current_cluster_data, evr_threshold, min_npoints_to_cluster, max_npoints_for_outlier, scorer_inner, clust_name, use_kmeans=use_kmeans, verbose=verbose)
                # find outliers
                ix_outlier = np.where(best_labels == OUTLIER_LABEL)[0]
                noutliers = len(ix_outlier)
                cluster_outlier_indxs = node.indxs[ix_outlier]
                ix_non_outlier = np.where(best_labels != OUTLIER_LABEL)[0]
                nnon_outlier = len(ix_non_outlier)
                if verbose:
                    print('perform_hierarchical_clustering: ', noutliers, 'outlier points', nnon_outlier, 'non outlier points in cluster', clust_name)
                # Use ix_non_outlier to filter current_cluster_data and the cluster indices for further processing
                current_cluster_data_filtered = current_cluster_data[ix_non_outlier]
                current_cluster_pca_data_filtered = current_cluster_pca_data[ix_non_outlier]
                current_cluster_indxs_filtered = node.indxs[ix_non_outlier]
                nsub_clusters_filtered = len(np.unique(best_labels[ix_non_outlier]))
                current_label = np.unique(possible_labels[current_cluster_indxs_filtered])[0]
                replacement_labels = get_replacement_current_cluster_labels(current_label, best_labels, next_cluster_label)
                if verbose:
                    print('perform_hierarchical_clustering: replace current cluster {}, label {}, by its subcluster labels'.format(clust_name, current_label), np.unique(replacement_labels))
                possible_labels[node.indxs] = replacement_labels
                if verbose:
                    print('perform_hierarchical_clustering: poss labels after inner loop for cluster', clust_name, np.unique(possible_labels))
                # get all the labels and data that are not outliers
                ix_good = np.where(possible_labels != OUTLIER_LABEL)[0]
                possible_labels_filtered = possible_labels[ix_good]
                all_data_filtered = cluster_data[ix_good]
                # compute score for this split on the original dataset minus the outliers
                # should this be calculated on actual data or pca data?
                new_score = scorer.compute_score(all_data_filtered, possible_labels_filtered, verbose=verbose)
                if verbose:
                    print('perform_hierarchical_clustering: got the new {} score'.format(scorer.name), new_score)
                split_is_better = scorer.split_is_better(new_score)
                if not split_is_better:
                    # go back to previous accepted labels and mark this cluster as tested so we don't try doing it again in the next loop
                    possible_labels = np.copy(all_labels)
                    node.complete = True
                    if verbose:
                        print('perform_hierarchical_clustering: level=', n_level, "we don't accept this split")
                        print('perform_hierarchical_clustering: possible labels set back to all_labels', np.unique(possible_labels))
                else:
                    if verbose:
                        print('perform_hierarchical_clustering: level=', n_level, 'we accept this split and write all subclusters to the tree')
                    # calculate the centroids for the new clusters (on the original, non-PCA variables)
                    # and take the data points nearest to them as medoids
                    clf = NearestCentroid(metric='euclidean')
                    if verbose:
                        print('perform_hierarchical_clustering: calculating centroids for current cluster with data:', current_cluster_data_filtered.shape, best_labels[ix_non_outlier].shape, type(best_labels[0]))
                    clf.fit(current_cluster_data_filtered, best_labels[ix_non_outlier])
                    medoids_indx_filtered, distances_filtered = pairwise_distances_argmin_min(clf.centroids_, current_cluster_data_filtered)
                    # index into the same filtered array that pairwise_distances_argmin_min searched
                    medoids = current_cluster_data_filtered[medoids_indx_filtered, :]
                    # now create nodes to add to the tree for the new sub clusters
                    if verbose:
                        print('perform_hierarchical_clustering: creating {} subclusters'.format(nsub_clusters_filtered))
                    for c in range(nsub_clusters_filtered):
                        ix = np.where(best_labels[ix_non_outlier] == c)[0]
                        current_subcluster_indxs = current_cluster_indxs_filtered[ix]
                        subcluster = ClusterNode('cl{}'.format(c + 1), node, current_subcluster_indxs, centroid=clf.centroids_[c], medoid=medoids[c])
                    # now create any outlier node
                    if noutliers > 0:
                        if outlier_node is None:
                            if verbose:
                                print('perform_hierarchical_clustering: creating outlier node')
                            outlier_node = ClusterNode(OUTLIER_NAME, root, cluster_outlier_indxs)
                        else:
                            # add these outlier indices to the already existing outlier node
                            outlier_node.indxs = np.concatenate([outlier_node.indxs, cluster_outlier_indxs])
                    # copy the labels we worked out this time to all_labels and save the score
                    all_labels = np.copy(possible_labels)
                    next_cluster_label = np.amax(all_labels) + 1
                    scorer.add_score(new_score, next_cluster_label)
                    n_splits += 1
        if n_splits == 0:
            best_split = True
            if verbose:
                print('perform_hierarchical_clustering: level {}. No new splits were done and best_split is reached'.format(n_level))
                print('perform_hierarchical_clustering: The list of scores looks like', scorer.scores)
        elif verbose:
            print('perform_hierarchical_clustering: level {}. We go to the next round'.format(n_level))
        this_plot_fname = '{}{}_tree_{}_loop.png'.format(output_dir, plot_fname_prefix, n_level)
        if verbose:
            print('perform_hierarchical_clustering: plotting tree for loop', this_plot_fname)
        root.plot_tree(this_plot_fname)
        out_fname = '{}_{}_data_{}_level_{}_active_clusters.hdf'.format(plot_fname_prefix, root.name, n_level, next_cluster_label)
        if verbose:
            print('perform_hierarchical_clustering: writing tree data for level', n_level, out_fname)
        clust_type = 'Kmeans' if use_kmeans else 'spectral'
        write_cluster_tree_hdf(root, clust_type, scorer_inner, scorer_outer, max_npoints_for_outlier, min_npoints_to_cluster, output_dir + out_fname)
        n_level += 1
    scorer.plot_scores(output_dir + plot_fname_prefix + '_')
    print('perform_hierarchical_clustering: loop complete clustering is finished')
    return root
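#---------------------------------------------------------------------------------
# Minimal usage sketch, not part of the library. The scorer-type strings, the
# synthetic data and the field names below are illustrative assumptions; check
# scores.py for the scorer names that create_scorer() actually accepts.
#---------------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    # three well separated Gaussian blobs in 4 hypothetical radar variables
    demo_data = np.concatenate([rng.normal(loc=m, scale=1.0, size=(300, 4)) for m in (0.0, 5.0, 10.0)])
    demo_fields = ['var{}'.format(i) for i in range(4)]  # hypothetical variable names
    demo_root = perform_hierarchical_clustering(demo_data, demo_fields,
                                                evr_threshold=0.95,           # keep ~95% of explained variance
                                                min_pc_points_to_cluster=5,   # 5% of samples
                                                max_pc_points_for_outlier=1,  # 1% of samples
                                                scorer_outer='CalinskiHarabasz',  # assumed scorer name
                                                scorer_inner='Silhouette',        # assumed scorer name
                                                output_dir='./', plot_fname_prefix='demo',
                                                use_kmeans=True, verbose=True)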