-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfullfactorialexperiment.R
145 lines (139 loc) · 7.63 KB
/
fullfactorialexperiment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
require(MixSim)
require(cluster)
require(clustrd)
require(clustMixType)
require(mclust)
require(kmed)
require(FactoMineR)
require(fpc)
require(mclust)
require(kamila)
# categorized numerical variable function
intv <- function(vec, class) {
nbase <- (1:(class-1))/class
nq <- numeric(length(nbase))
for (i in 1:length(nq)) {
nq[i] <- quantile(vec, nbase[i])
}
res <- c(min(vec), nq, max(vec))
res[1] <- res[1]-1
for (i in 2:length(res)){
if (res[i-1]==res[i]){
res[i] <- res[i]+2e-15
}
}
return(res)
}
# Full factorial simulation study - NON-SPHERICAL CLUSTERS
# Empty data frame to store simulation results
fullfactorial <- data.frame(seed=numeric(),
nClust=numeric(),
overlap=numeric(),
nrows=numeric(),
ncols=numeric(),
pi=numeric(),
method=character(),
res=numeric(),
stringsAsFactors=FALSE)
cluster_sizes <- c(3:5)
overlap_levels <- c(0.01, seq(0.05, 0.20, by=0.05))
num_rows <- c(300, 600, 1200)
num_cols <- c(6, 10, 14)
pi_values <- c(0.01, 1.0)
nreps <- 50
tot <- nreps*length(cluster_sizes)*length(overlap_levels)*length(num_rows)*length(num_cols)*length(pi_values)
for (l in 0:(nreps-1)){
# Random seed
set.seed(1234+l)
# Number of clusters
for (clusters in cluster_sizes){
# Overlap level
for (overlap in overlap_levels){
# Number of rows
for (rows in num_rows){
# Number of variables/columns
for (columns in num_cols){
# Balanced/unbalanced design
for (pi_val in pi_values){
# Construct artificial data set - sphericity set to FALSE by default
mixsimaux <- MixSim(BarOmega = overlap, MaxOmega = 3*overlap/2, PiLow=pi_val, K = clusters, p = columns, resN = 1000000)
mixdtaux <- simdataset(n = rows, Pi = mixsimaux$Pi, Mu = mixsimaux$Mu, S = mixsimaux$S)
# Discretise first half attributes using 4 levels
for (k in 1:(columns/2)){
mixdtaux$X[,k] <-
as.factor(cut(mixdtaux$X[,k], intv(mixdtaux$X[,k], 4), labels = (1:4)))
}
mixdt1df <- as.data.frame(mixdtaux$X)
for (k in 1:(columns/2)){
mixdt1df[,k] <- as.factor(mixdt1df[,k])
}
# this temporarily fixes a nasty bug of cluspcamix
colnames(mixdt1df) <- sprintf("a%d", 1:columns)
save(mixdt1df,file=paste("mixdt1df",run,sep = "",".Rdata"))
# Calculate Gower's dissimilarities and Ahmad distances
gower_dist <- daisy(mixdt1df, metric = "gower")
mix_sim <- distmix(mixdt1df, method = "ahmad", idcat = c(1:(columns/2)), idnum = c((columns/2+1):columns))
# Specify continuous & categorical attributes
conDf <- data.frame(scale(mixdt1df[,((columns/2+1):columns)]))
catDf <- dummyCodeFactorDf(data.frame(mixdt1df[,1:(columns/2)]))
# Extract principal components
outpcamix <- FAMD(mixdt1df, ncp = clusters -1, graph=FALSE)
# PAM with Gower's
pam_fit <- pam(gower_dist, diss = TRUE, k = clusters, do.swap = FALSE, cluster.only = TRUE,
nstart=100)
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap,
nrows=rows, ncols=columns, pi=pi_val,
method='PAM', res=adjustedRandIndex(pam_fit, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('PAM done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
# K-prototypes
outk <- kproto(mixdt1df, clusters, nstart = 100,verbose = FALSE)
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap, nrows=rows,
ncols=columns, pi=pi_val, method='K-Prot',
res=adjustedRandIndex(outk$cluster, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('K-prototypes done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
# Mixed K-means
kmedres <- fastkmed(mix_sim, clusters, iterate = 100, init = sample(1:nrow(mixdt1df), clusters))
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap, nrows=rows,
ncols=columns, pi=pi_val, method='Mixed',
res=adjustedRandIndex(kmedres$cluster, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('Mixed K-means done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
# Modha-Spangler
msRes <- gmsClust(conDf, catDf, nclust = clusters, searchDensity = 5, nstart=100)
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap, nrows=rows,
ncols=columns, pi=pi_val, method='Modha-Spangler',
res=adjustedRandIndex(msRes$results$cluster, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('Modha-Spangler done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
# FAMD + K-means
outkm <- kmeans(outpcamix$ind$coord, nstart=100, clusters, iter.max=1000, algorithm = "MacQueen")
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap, nrows=rows,
ncols=columns, pi=pi_val, method='FAMD',
res=adjustedRandIndex(outkm$cluster, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('FAMD done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
# Mixed RKM
outmix = clustrd::cluspcamix(mixdt1df, nclus = clusters, ndim = clusters-1, nstart=100)
fullfactorial <- rbind(fullfactorial, data.frame(seed=l+1, nClust=clusters, overlap=overlap, nrows=rows,
ncols=columns, pi=pi_val, method='RKM',
res=adjustedRandIndex(outmix$cluster, mixdtaux$id)))
save(fullfactorial, file='fullfactorial.RData')
cat('Mixed RKM done for dataset',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
cat('Simulations for data set',l+1,'with',clusters,'clusters,',rows,'rows',columns,
'columns, an overlap of',overlap, 'and pi',pi_val,'\n')
cat('***** Run',run,'/',tot,' *****','\n')
run <- run + 1
}
}
}
}
}
}