Merge pull request #44 from katelynqueen98/master

Fix bug when first cluster is of size one
USCbiostats · Nov 10, 2024 · bf5863a · bf5863a
2 parents 4c3b323 + d14c3f1
commit bf5863a
Showing 1 changed file with 32 additions and 7 deletions.
diff --git a/R/super_partition.R b/R/super_partition.R
@@ -62,18 +62,31 @@ super_partition <- function(full_data,
   }
 
   # ensure 0 < threshold < 1
-  if (0 > threshold | 1 < threshold) stop("Threshold must be between 0 and 1.")
+  if (0 > threshold | 1 < threshold) {
+    stop("Threshold must be between 0 and 1.")
+  }
 
   # ensure no column names contain x
-  if (any(grepl(x, colnames(full_data)))) stop(paste0("The prefix for new variable names, ", x, ", is contained within existing data column names. Please choose a different prefix to avoid errors."))
+  if (any(grepl(x, colnames(full_data)))) {
+    stop(paste0("The prefix for new variable names, ", x, ", is contained within existing data column names. Please choose a different prefix to avoid errors."))
+  }
 
   # ensure data frame structure
   full_data <- as.data.frame(full_data)
 
   # if < cluster_size features, call regular partition
   if (ncol(full_data) < cluster_size) {
     message(paste0("Using `partition()` since there are < ", cluster_size, "features."))
-    return(partition(full_data, threshold = threshold))
+    prt <- partition(
+      full_data,
+      threshold = threshold,
+      partitioner = partitioner,
+      tolerance = tolerance,
+      niter = niter,
+      x = x,
+      .sep = .sep
+    )
+    return(prt)
   }
 
   # iteration counters
@@ -183,18 +196,27 @@ super_partition <- function(full_data,
     )
   }
 
-  ## first cluster
+  # if no dimension reduction, use partition instead
+  if (length(unique(master_cluster$cluster)) == ncol(full_data)) {
+    if (verbose) message("No dimension reduction occured using Super Partition. Using Partition instead.")
+    return(partition(full_data, threshold, partitioner, tolerance, niter, x, .sep))
+  }
+
+  ## first cluster - always use largest cluster
+  clust_sizes   <- as.data.frame(table(master_cluster$cluster))
+  first_clust   <- which(unique(master_cluster$cluster) == clust_sizes[which.max(clust_sizes$Freq), 1])
+
   # get initial partition to build off
   part_master <- partition(
-    full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[1])],
+    full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[first_clust])],
     threshold, partitioner, tolerance, niter, x, .sep
   )
 
   # update indices for each module
   mod_rows <- grep(x, part_master$mapping_key$variable)
   part_master$mapping_key$indices <- full_data_col_numbers(
     full_data = full_data,
-    small_data = full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[1])],
+    small_data = full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[first_clust])],
     modules = part_master$mapping_key$indices
   )
 
@@ -208,7 +230,10 @@ super_partition <- function(full_data,
   if (progress_bar) pb$tick()
 
   # for each cluster...
-  for (i in 2:n_iter) {
+  for (i in seq_len(n_iter)) {
+    # skip if first cluster
+    if (i == first_clust) next()
+
     # what to do if cluster is of size one
     if (sum(master_cluster$cluster == unique(master_cluster$cluster)[i]) == 1) {
       # cbind data to master partition reduced data