forked from DiogoApostolo/pycol
-
Notifications
You must be signed in to change notification settings - Fork 4
/
complexity.py
2072 lines (1588 loc) · 75.5 KB
/
complexity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import copy
import math
from operator import itemgetter, ne
from typing import overload
from numpy.core.numerictypes import ScalarType
from pandas import read_csv
import numpy as np
from numpy.core.fromnumeric import shape, transpose, var
from scipy.sparse import data
import arff
from scipy.spatial.kdtree import distance_matrix
from sklearn.cluster import KMeans
import sklearn
import scipy.spatial
import sys
np.random.seed(0)
class Complexity:
'''
Complexity class, it makes available the following methods to calculate complexity metrics:
F1, F1v, F2, F3, F4, R_value, D3, CM, kDN, MRCA, C1, C2, T1, DBC, N1, N2, N3, N4, SI,
LSC, purity, neighbourhood_seperability, input_noise, borderline, deg_overlap, ICSV, NSG and Clust
'''
def __init__(self,file_name,distance_func="default",file_type="arff"):
'''
Constructor method, setups up the the necessary class attributes to be
used by the complexity measure functions.
Starts by reading the file in arff format which contains the class samples X (self.X), class labels y (self.y) and contextual information
about the features (self.meta).
It then calculates the distance matrix that contains the distance between all points in X (self.dist_matrix).
It also saves in an array the unique labels of all existing classes (self.classes), the number of samples in each class (self.class_count) and
the indexes in X of every class (self.class_inxs).
-----
Parameters:
file_name (string): Location of the file that contains the dataset.
distance_func (string): The distance function to be used to calculate the distance matrix. Only available option right now is "HEOM".
file_type (string): The type of file where the dataset is stored. Only available option right now is "arff".
'''
if(file_type=="arff"):
[X,y,meta]=self.__read_file(file_name)
else:
print("Only arff files are available for now")
return
self.X=np.array(X)
#self.X = self.X[:,0:2]
self.y=np.array(y)
classes=np.unique(self.y)
self.classes = classes
self.meta=meta
self.dist_matrix,self.unnorm_dist_matrix = self.__calculate_distance_matrix(self.X,distance_func=distance_func)
self.class_count = self.__count_class_instances()
self.class_inxs = self.__get_class_inxs()
return
def __count_class_instances(self):
'''
Is called by the __init__ method.
Count instances of each class in the dataset.
--------
Returns:
class_count (numpy.array): An (Nx1) array with the number of intances for each of the N classes in the dataset
'''
class_count = np.zeros(len(self.classes))
for i in range(len(self.classes)):
count=len(np.where(self.y == self.classes[i])[0])
class_count[i]+=count
return class_count
def __read_file(self,file_name):
'''
Is called by the __init__ method.
Read an arff file containing the dataset.
--------
Parameters:
file_name: string, name of the arff file to read from
--------
Returns:
X (numpy.array): An array containing the attributes of all samples;
y (numpy.array): An array containing the class labels of all samples;
meta (array): An array with information about the type of attributes (numerical or categorical)
--------
'''
f = arff.load(open(file_name, 'r'))
data = f['data']
num_attr = len(data[0])-1
att=f['attributes']
meta=[]
#for every attribute check if it is numeric or categorical
for i in range(len(att)-1):
if(att[i][1]=="NUMERIC"):
meta.append(0)
else:
meta.append(1)
#split each sample into attributes and label
X = [i[:num_attr] for i in data]
y = [i[-1] for i in data]
#indentify the existing classes
classes = np.unique(y)
#create new class labels from 0 to n, where n is the number of classes
y = [np.where(classes == i[-1])[0][0] for i in data]
return [X,y,meta]
def __distance_HEOM(self,X):
'''
Is called by the calculate_distance_matrix method.
Calculates the distance matrix between all pairs of points from an input matrix, using the HEOM metric, that way categorical attributes are
allow in the dataset.
--------
Parameters:
X (numpy.array): An (N*M) numpy matrix containing the points, where N is the number of points and M is the number of attributes per point.
--------
Returns:
dist_matrix (numpy.array): A (M*M) matrix containing the distance between all pairs of points in X
'''
meta= self.meta
dist_matrix=np.zeros((len(X),len(X)))
unnorm_dist_matrix = np.zeros((len(X),len(X)))
#calculate the ranges of all attributes
range_max=np.max(X,axis=1)
range_min=np.min(X,axis=1)
for i in range(len(X)):
for j in range(i+1,len(X)):
#for attribute
dist = 0
unnorm_dist = 0
for k in range(len(X[0])):
#missing value
if(X[i][k] == None or X[j][k]==None):
dist+=1
unnorm_dist+=1
#numerical
if(meta[k]==0):
#dist+=(abs(X[i][k]-X[j][k]))**2
#dist+=(abs(X[i][k]-X[j][k])/(range_max[k]-range_min[k]))**2
if(range_max[k]-range_min[k]==0):
dist+=(abs(X[i][k]-X[j][k]))**2
unnorm_dist+=(abs(X[i][k]-X[j][k]))**2
else:
dist+=(abs(X[i][k]-X[j][k])/(range_max[k]-range_min[k]))**2
unnorm_dist+= abs(X[i][k]-X[j][k])
#dist+=(abs(X[i][k]-X[j][k]))**2
#categorical
if(meta[k]==1):
if(X[i][k]!=X[j][k]):
dist+=1
unnorm_dist+=1
dist_matrix[i][j]=np.sqrt(dist)
dist_matrix[j][i]=np.sqrt(dist)
unnorm_dist_matrix[i][j]=np.sqrt(unnorm_dist)
unnorm_dist_matrix[j][i]=np.sqrt(unnorm_dist)
#print(dist_matrix)
return dist_matrix,unnorm_dist_matrix
def __distance_HEOM_different_arrays(self,X,X2):
'''
Calculates the distance matrix between all pairs of points from 2 input matrixes, using the HEOM metric, that way categorical attributes are
allow in the dataset.
--------
Parameters:
X (numpy.array): An (N*M) numpy matrix containing the first set of points, where N is the number of points and M is the number of attributes per point.
X2 (numpy.array): An (N*M) numpy matrix containing the second set of points, where N is the number of points and M is the number of attributes per point.
--------
Returns:
dist_matrix (numpy.array): A (M*M) matrix containing the distance between all pairs of points in X
'''
meta= self.meta
dist_matrix=np.zeros((len(X2),len(X)))
#calculate the ranges of all attributes
range_max=np.max(X,axis=1)
range_min=np.min(X,axis=1)
for i in range(len(X2)):
for j in range(len(X)):
#for attribute
dist = 0
for k in range(len(X2[0])):
#missing value
if(X2[i][k] == None or X[j][k]==None):
dist+=1
#numerical
if(meta[k]==0):
if(range_max[k]-range_min[k]==0):
dist+=(abs(X2[i][k]-X[j][k]))**2
else:
dist+=(abs(X2[i][k]-X[j][k])/(range_max[k]-range_min[k]))**2
#categorical
if(meta[k]==1):
if(X2[i][k]!=X[j][k]):
dist+=1
dist_matrix[i][j]=np.sqrt(dist)
#print(dist_matrix)
return dist_matrix
def __calculate_distance_matrix(self,X,distance_func="HEOM"):
'''
Is called by the __init__ method.
Function used to select which distance metric will be used to calculate the distance between a matrix of points.
Only the HEOM metric is implemented for now, however if more metrics are added this function can easily be changed to
incomporate the new metrics.
--------
Parameters:
X (numpy.array): An (N*M) numpy matrix containing the points, where N is the number of points and M is the number of attributes per point.
distance_func (string): The distance function to be used, only available option right now is "HEOM"
--------
Returns:
dist_matrix (numpy.array): A (M*M) matrix containing the distance between all pairs of points in X
--------
'''
if(distance_func=="HEOM"):
distance_matrix,unnorm_distance_matrix=self.__distance_HEOM(X)
elif(distance_func=="default"):
distance_matrix,unnorm_distance_matrix=self.__distance_HEOM(X)
#add other distance functions
return distance_matrix,unnorm_distance_matrix
def __get_class_inxs(self):
'''
Called by the __init__ method.
Calculates what are the indexes in X and y for each of the classes.
--------
Returns:
class_inds (array): An array of arrays where each inner array contains the indexes for one class. The number of inner arrays is equal to the
total number of unique classes in X.
--------
'''
class_inds = []
for cls in self.classes:
cls_ind=np.where(self.y==cls)[0]
class_inds.append(cls_ind)
return class_inds
def __knn(self,inx,line,k,clear_diag=True):
'''
Called by all the complexity metrics that need to calculate the nearest neighbours of a sample.
Calculates the class labels of the k nearest neighbours of a sample x. If clear_diag is True, it is assumed that point in
position "inx" is the sample x, which will have distance 0 to itself and so the distance is changed to infinite to avoid
one of the nearest neighbours being the point itself.
--------
Parameters:
inx (int): The index of the point in the array line. Not relevant if "line" does not contain the point itself.
line (array): An array of distances from "x" to all the points. Usually taken from the dist_matrix attribute of the class.
k (int): the number of neighbours to consider.
clear_diag (bool): True if the distance in position "inx" of "line" is the point itself.
--------
Returns:
count: An (n*1) array, where n is the number of unique class labels, where in each position is the value of how many of the k
nearest neighbours belong to that class. For example if k=5 and there are 3 classes a possible configuration could be [2,3,0] meaning that 2
of the neighbours are from the 1st class, 3 are from the second class, and there are none from the 3rd class.
'''
count = np.zeros(len(self.classes))
if(clear_diag):
line[inx]=math.inf
for i in range(k):
index=np.where(line == min(line))[0][0]
line[index]=math.inf
#if(self.y[index]!=self.y[inx]):
cls_inx = np.where( self.classes == self.y[index])[0][0]
count[cls_inx]+=1
return count
def __hypersphere(self,inx,sigma,distance_matrix=[],y=[]):
'''
Called by the C1 and MRCA complexity measures.
Draws an hypersphere of radius "sigma" and center on a sample x and calculates the number of samples that
are from the same class as a sample "x" as well as the number of sample that are not from the same class.
The sample will count itself as a instance of the same class inside the hypersphere.
If no distance matrix or class label arrays are passed the function assumes uses the attributes of the complexity
class calcultated in the __init__ method.
--------
Parameters:
inx (int): Index of the sample x in the array X
sigma (float): Size of the radius of the hypersphere
distance_matrix (numpy.array): An (N*N) matrix, where N is the number of points in the dataset, contains the pair wise
distances between all points
y (numpy.array): An (N*1) array containing all the class labels from the dataset
--------
Returns:
n_minus (int): The number of instances that have a different class label than sample x
n_plus (int): The number of instances that have the same class label than sample x
--------
'''
if(len(distance_matrix)==0):
distance_matrix=self.dist_matrix
if(len(y)==0):
y=self.y
#an array containing the distance to all points
line = distance_matrix[inx]
n_minus = 0
n_plus = 0
for i in range(len(line)):
#if the sample is inside de hypersphere
#print(line)
if(line[i]<=sigma):
#if the sample is from the same class as "x"
if(y[i]==y[inx]):
n_plus+=1
else:
n_minus+=1
return [n_minus,n_plus]
def __hypersphere_sim(self,inx,sigma):
'''
Called by C2 function.
Similiar to the hypersphere function, however instead of calculating the number of samples that are from the same class and the number
of samples that isn't, it calculates the sum of the distance from sample "x" to all samples of with the same class label and the sum of distance
to all samples with a different class label.
--------
Parameters:
inx (int): The index of the sample "x" in the array X
sigma (float): The radius of the hypersphere
--------
Returns:
n_minus: The sum of the distances from "x" to all the samples inside the hypersphere with a different class label
n_plus: The sum of the distances from "x" to all the samples inside the hypersphere with the same class label
--------
'''
line = self.dist_matrix[inx]
n_minus = 0
n_plus = 0
for i in range(len(line)):
#if the sample is inside the hypersphere
if(line[i]<=sigma):
#if the sample is from the same class as "x"
if(self.y[i]==self.y[inx]):
n_plus+=line[i]
else:
n_minus+=line[i]
return [n_minus,n_plus]
def R_value(self,k=5,theta=2):
'''
Calculate the Augmented R value complexity measure defined in [1].
--------
Parameters:
k (int): Number of neighbours, for the knn function
tetha (int): threshold of neighbours that can have a different class label
--------
Returns:
r_values (array): An array with the R values of the pair wise combinations of all classes (One VS One)
--------
References:
[1] Borsos Z, Lemnaru C, Potolea R (2018) Dealing with overlap and imbalance:
a new metric and approach. Pattern Analysis and Applications 21(2):381-395
--------
'''
r_matrix=np.zeros((len(self.classes),len(self.classes)))
for i in range(len(self.dist_matrix)):
line = self.dist_matrix[i]
#caclutate the k nearest neighbours of the instance
count=self.__knn(i,copy.copy(line),k)
cls_inx = np.where( self.classes == self.y[i])[0][0]
for j in range(len(self.classes)):
#check if the threshold of neighbours is crossed
if(theta<count[j]):
r_matrix[cls_inx,j]+=1
for i in range(len(r_matrix)):
for j in range(len(r_matrix[0])):
r_matrix[i,j]=r_matrix[i,j]/self.class_count[i]
r_values = []
for i in range(len(r_matrix)):
for j in range(i+1,len(r_matrix)):
imbalanced_ratio = 0
maj_r = 0
min_r = 0
if(self.class_count[i]>self.class_count[j]):
imbalanced_ratio = self.class_count[j]/self.class_count[i]
maj_r = r_matrix[i,j]
min_r = r_matrix[j,i]
else:
imbalanced_ratio = self.class_count[i]/self.class_count[j]
maj_r = r_matrix[j,i]
min_r = r_matrix[i,j]
r=(1/(imbalanced_ratio+1))*(maj_r+imbalanced_ratio*min_r)
r_values.append(r)
return r_values
def D3_value(self,k=5):
'''
Calculate the D3 value complexity measure defined in [1].
--------
Parameters:
k (int): Number of neighbours, for the knn function
--------
Returns:
d3_matirx (np.array): An array with the d3 values for all classes
--------
References:
[1] Sotoca JM, Mollineda RA, Sanchez JS (2006) A meta-learning framework
for pattern classication by means of data complexity measures. Inteligencia
Articial Revista Iberoamericana de Inteligencia Artificial 10(29):31-38
--------
'''
d3_matrix=np.zeros(len(self.classes))
for i in range(len(self.dist_matrix)):
line = self.dist_matrix[i]
count=self.__knn(i,copy.copy(line),k)
cls_inx = np.where( self.classes == self.y[i])[0][0]
if(0.5>(count[cls_inx]/k)):
d3_matrix[cls_inx]+=1
return d3_matrix
def kDN(self,k=5):
'''
Calculate the kDN value complexity measure defined in [1]
--------
Parameters:
k (int): Number of neighbours, for the knn function
--------
Returns:
kDN (float): The value of the complexity measure
--------
References:
[1] Smith MR, Martinez T, Giraud-Carrier C (2014) An instance level analysis
of data complexity. Machine learning 95(2):225-256
--------
'''
kDN_value = 0
for i in range(len(self.dist_matrix)):
line = self.dist_matrix[i]
count=self.__knn(i,copy.copy(line),k)
cls_inx = np.where( self.classes == self.y[i])[0][0]
kDN_value+= (k-count[cls_inx])/k
kDN_value/=len(self.X)
return kDN_value
def CM(self,k=5):
'''
Calculate the CM value complexity measure defined in [1].
-------
Parameters:
k (int): Number of neighbours, for the knn function
-------
Returns:
CM_value (float): The value of the complexity measure
-------
References:
[1] Anwar N, Jones G, Ganesh S (2014) Measurement of data complexity
for classification problems with unbalanced data. Statistical Analysis and
Data Mining: The ASA Data Science Journal 7(3):194-211
-------
'''
CM_value = 0
for i in range(len(self.dist_matrix)):
line = self.dist_matrix[i]
count=self.__knn(i,copy.copy(line),k)
cls_inx = np.where( self.classes == self.y[i])[0]
kDN_value = (k-count[cls_inx])/k
if(kDN_value>0.5):
CM_value+=1
CM_value/=len(self.X)
return CM_value
def __MRI_p(self,profile):
'''
Calculate the MRI value of a pattern.
------
Parameters:
profile (array): An array containing the features of the pattern.
------
Returns:
mri_val (float): The MRI value of the pattern.
'''
sum_val = 0
m = len(profile)
for j in range(m):
w = (1-(j/m))
sum_val += w * (1-profile[j])
mri_val = sum_val*(1/(2*m))
#print(profile)
#print(mri_val)
return mri_val
def __MRI_k(self,cluster):
'''
Called by the MRCA function. Calculates the Multiresolution Index (MRI) for a cluster by
averaging the MRI value of each pattern in the cluster.
--------
Parameters:
cluster (array): An array of arrays (N*M) containg all the points in a cluster, where N is the number of points and M is the
dimesion of each point.
---------
Returns:
mri_val (float): The multiresolution index of the cluster
---------
'''
sum_val = 0
for profile in cluster:
sum_val+= self.__MRI_p(profile)
mri_val=sum_val/len(cluster)
return mri_val
def MRCA(self,sigmas=[0.1,0.2,0.5],n_clusters=3,distance_func="default"):
'''
Calculates the MRCA value complexity measure defined in [1].
-------
Parameters:
sigmas (float): An array with the multiple hypersphere radius
n_clusters (int): the number of clusters to group in the kMeans algorithm
-------
Returns:
mrca (array): An array (n_clusters*1), with the mrca value of each cluster
-------
References:
[1] Armano G, Tamponi E (2016) Experimenting multiresolution analysis for
identifying regions of different classification complexity. Pattern Analysis
and Applications 19(1):129-137
'''
#one vs one approach
for i2 in range(len(self.class_inxs)):
for j2 in range(i2+1,len(self.class_inxs)):
#create new arrays with just the 2 classes being considired for this iteration
c1 = self.classes[i2]
c2 = self.classes[j2]
sample_c1 = self.X[self.class_inxs[i2]]
sample_c2 = self.X[self.class_inxs[j2]]
sample_c1_y = self.y[self.class_inxs[i2]]
sample_c2_y = self.y[self.class_inxs[j2]]
new_X = np.concatenate([sample_c1,sample_c2],axis=0)
new_y = np.concatenate([sample_c1_y,sample_c2_y],axis=0)
new_dist_matrix,_ = self.__calculate_distance_matrix(new_X,distance_func=distance_func)
mrca=np.zeros(n_clusters)
profiles = np.zeros((len(new_X),len(sigmas)))
#for each point
for i in range(len(new_X)):
#for each radius
for j in range(len(sigmas)):
sigma = sigmas[j]
#change this: if there is more than 2 classes the hypershpere n_minus of all other classes
n = self.__hypersphere(i,sigma,distance_matrix=new_dist_matrix,y=new_y)
#calculate the psi values for each profile in accordance to [1]
if(new_y[i]==c1):
alt_y = 1
psi = alt_y * ((n[1]-n[0])/(n[1]+n[0]))
else:
alt_y = -1
psi = alt_y * ((n[0]-n[1])/(n[0]+n[1]))
profiles[i,j]=psi
#cluster the the profiles
kmeans = KMeans(n_clusters=n_clusters).fit(profiles)
#for each cluster calculate the MRI value
for i in range(n_clusters):
inx = np.where( kmeans.labels_ == i)[0]
cluster = profiles[inx]
mrca[i]=self.__MRI_k(cluster)
return mrca
def C1(self,sigmas=[0.01,0.05,0.1]):
'''
Calculate the C1 value complexity measure defined in [1].
-------
Parameters:
sigmas (float): An array with the multiple hypersphere radius
-------
Returns:
c1_val (float): The value of the complexity measure
-------
References:
[1] Massie S, Craw S, Wiratunga N (2005) Complexity-guided case discovery
for case based reasoning. In: AAAI, vol 5, pp 216-221
'''
c1_sum = 0
for i in range(len(self.X)):
c1_instance_sum = 0
#for each radius sigma
for sigma in sigmas:
#draw a hypersphere with radius sigma around the point and check which points inside are from the same class and
#which are not.
n = self.__hypersphere(i,sigma)
pkj=n[1]/sum(n)
c1_instance_sum += pkj
c1_instance_val = 1-(c1_instance_sum/len(sigmas))
c1_sum += c1_instance_val
c1_val = c1_sum/len(self.X)
return c1_val
def C2(self,sigmas=[0.1,0.2,0.5]):
'''
Calculate the C2 value complexity measure defined in [1].
-------
Parameters:
sigmas (float): An array with the multiple hypersphere radius
-------
Returns:
c2_val (float): The value of the complexity measure
-------
References:
[1] Cummins L (2013) Combining and choosing case base maintenance algorithms. PhD thesis, University College Cork
'''
c2_sum = 0
for i in range(len(self.X)):
c2_instance_sum = 0
for sigma in sigmas:
n = self.__hypersphere_sim(i,sigma)
if(sum(n)!=0):
pkj=n[1]/sum(n)
else:
pkj=1
c2_instance_sum += pkj
c2_instance_val = 1-(c2_instance_sum/len(sigmas))
c2_sum += c2_instance_val
c2_val = c2_sum/len(self.X)
return c2_val
def __calculate_n_inter(self,dist_matrix=[],y=[]):
'''
Called by the DBC and N1 complexity measure functions.
Calculates the miminimum spanning tree using a distance matrix, dist_matrix. Afterwards it counts the amount
of vertixes in the MST that have an edge connecting them are from 2 distinct classes.
If no distance matrix or class label array y is passed it assumes the attributes calculated in the
__init__ method.
-------
Parameters:
dist_matrix (numpy.array): A distance matrix between all points, used to calculate the MST
y (numpy.array): an array with the class labels
-------
Returns:
count (int): the number of vertixes connected by an edge that have different class labels
'''
#If no parameters are passed it uses the distance matrix and class labels calcultated in the __init__ method
#This is necessary because the distance matrix and class labels are different depending on the complexity measure
#that calls this function.
if len(dist_matrix)==0:
dist_matrix = self.dist_matrix
if len(y) == 0:
y = self.y
#calculate the MST using the distance matrix
minimum_spanning_tree = scipy.sparse.csgraph.minimum_spanning_tree(csgraph=np.triu(dist_matrix, k=1), overwrite=True)
#convert the mst to an array
mst_array = minimum_spanning_tree.toarray().astype(float)
#iterate over the MST to determine which vertixes that are connected are from different classes.
vertix = []
aux_count = 0
for i in range(len(mst_array)):
for j in range(len(mst_array[0])):
if(mst_array[i][j]!=0):
if (y[i]!=y[j]):
vertix.append(i)
vertix.append(j)
count=len(np.unique(vertix))
return count
def N1(self):
'''
Calculate the N1 value complexity measure defined in [1].
-------
Returns:
n1 (float): The value of the complexity measure
-------
References:
Ho T, Basu M (2002) Complexity measures of supervised classification
problems. IEEE transactions on pattern analysis and machine intelligence 24(3):289-300
'''
count = self.__calculate_n_inter()
n1 = count/len(self.y)
return n1
def N2(self):
'''
Calculate the N2 value complexity measure defined in [1].
-------
Returns:
n2 (float): The value of the complexity measure
-------
References:
Ho T, Basu M (2002) Complexity measures of supervised classification
problems. IEEE transactions on pattern analysis and machine intelligence 24(3):289-300
'''
count_inter = 0
count_intra = 0
#for each sample
for i in range(len(self.dist_matrix)):
min_inter = np.inf
min_intra = np.inf
#iterate over every sample and check which is the nearest neighbour from the same class and
#which is the nearest neighbour from the opposite class.
for j in range(len(self.dist_matrix[0])):
if(self.y[i]==self.y[j] and i!=j and self.dist_matrix[i][j]<min_intra):
min_intra=self.dist_matrix[i][j]
if(self.y[i]!=self.y[j] and self.dist_matrix[i][j]<min_inter):
min_inter=self.dist_matrix[i][j]
count_inter+=min_inter
count_intra+=min_intra
r = count_intra/count_inter
N2 = r/(1+r)
return N2
def N3(self,k=1):
'''
Calculate the N3 value complexity measure in [1].
By default k=1 in accordance to the definition in the paper. However it is possible to select a different value for k.
-------
Parameters:
k (int): number of nearest neighbours to consider
-------
Returns:
n3 (float): the value of the complexity measure
-------
References:
Ho T, Basu M (2002) Complexity measures of supervised classification
problems. IEEE transactions on pattern analysis and machine intelligence 24(3):289-300
'''
sample_count=0
#for each sample
for sample in range(len(self.X)):
#calculate the nearest neighbour
count=self.__knn(sample,copy.copy(self.dist_matrix[sample]),k)
cls_inx = np.where( self.classes == self.y[sample])[0][0]
class_count=count[cls_inx]
max_count=np.max(count)
#are there more instances with the same class label or with a different class label.
if(class_count<max_count):
sample_count+=1
n3 = sample_count/len(self.y)
return n3
def __interpolate_samples(self):
'''
Create new interpolated samples from the sample array X.
To achieve this 2 samples in X from the same class are selected and a new sample is created by interpolating these 2.
This sample will have the same class label of the 2 used to create it.
This process is repeated N times, where N is the size of X.
------
Returns:
X_interp (array): The new array with the interpolated samples
y_inter (array): An array with the labels of the new class samples.
'''
X_interp = []
y_interp = []
for cls_inx in self.class_inxs:
new_X = self.X[cls_inx,:]
new_y = self.y[cls_inx]
sample1_inxs = np.random.choice( len(new_X), len(new_X))
sample2_inxs = np.random.choice( len(new_X), len(new_X))
sample1 = new_X[sample1_inxs, :]
sample2 = new_X[sample2_inxs, :]
alpha = np.random.ranf(new_X.shape)
X_interp_cls = sample1 + (sample2 - sample1)*alpha
y_interp=np.append(y_interp,new_y)
if(len(X_interp)==0):
X_interp=X_interp_cls
else:
X_interp=np.concatenate((X_interp,X_interp_cls),axis=0)
return X_interp, y_interp
def N4(self,k=1):
'''
Calculate the N4 value complexity measure defined in [1].
-------
Parameters:
k (int): Number of nearest neighbours to consider
-------
Returns:
n4 (float): The value of the complexity measure
-------
References:
[1] Lorena AC, Garcia LP, Lehmann J, Souto MC, Ho TK (2019) How complex
is your classification problem? a survey on measuring classification
complexity. ACM Computing Surveys (CSUR) 52(5):1-34
'''
#create new interpolated samples
X_interp, y_interp=self.__interpolate_samples()
new_dist = self.__distance_HEOM_different_arrays(self.X,X_interp)
sample_count=0
#for each sample in X_interpol check calculate their k nearest neighbours in X and determine if there
#are more neighbours from the same class or more neighbours from the opposite classes.
for sample in range(len(X_interp)):
count=self.__knn(sample,copy.copy(new_dist[sample]),k,clear_diag=False)
cls_inx = np.where( self.classes == y_interp[sample])[0][0]
#number of neighbours with the same class label
class_count=count[cls_inx]
max_count=np.max(count)
if(class_count<max_count):
sample_count+=1
n4 = sample_count/len(y_interp)
return n4
def SI(self,k=1):
'''
Calculate the Separability index (SI) complexity measure defined in [1].
--------
Returns:
si_measure (float): The SI complexity measure value calculated.
--------
References:
Thornton C (1998) Separability is a learner's best friend. In: 4th Neural
Computation and Psychology Workshop, London, 9-11 April 1997,
Springer, pp 40-46
'''
sample_count=0
#for each sample
for sample in range(len(self.X)):
#calculate the nearest neighbour
count=self.__knn(sample,copy.copy(self.dist_matrix[sample]),k)
cls_inx = np.where( self.classes == self.y[sample])[0][0]
class_count=count[cls_inx]
max_count=np.max(count)
#are there more instances with the same class label or with a different class label.
if(class_count==max_count):
sample_count+=1