WIP

sistm · Jun 26, 2024 · fdf175d · fdf175d
1 parent 043146a
commit fdf175d
Showing 1 changed file with 154 additions and 134 deletions.
diff --git a/R/plot_ccdf_cit_gsa.R b/R/plot_ccdf_cit_gsa.R
@@ -1,5 +1,5 @@
 
-#' Function for plotting the CDF of all the genes within a gene set
+#' Function for plotting the CDF of all the genes within a gene set or for one gene
 #'
 #' @param x an object of class \code{\link{cit_gsa}} 
 #' 
@@ -17,149 +17,169 @@
 
 plot.citcdf <- function(x, ...){ 
 
-  if(x$type =="gsa"){
-
-    ccdf <- x$ccdf  
-
-    genes <- names(ccdf)
-
-    # Create data frame with all the ccdf ----
-    data_gene <- do.call(rbind, lapply(genes, function(name) {
-      x <- ccdf[[name]]  
-      df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y)
-      colnames(df) <- c("cdf", "ccdf", "x", "y") 
-      df$Gene <- rep(name, nrow(df))
-      return(df)
-    })) 
-
-    sev <- unique(data_gene$x)
-
-    # Complete data if no values for some y ----
-    data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and x
-
-    new_data_gene_sep <- lapply(data_gene_sep, function(df) {
-      # max value of current y
-      max_y_cur <- max(unique(df$y)) 
-      # max value of total y
-      max_y_all <- max(unique(data_gene$y))
-
-      # all the y that are between max current y value and max total y value
-      y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data 
-      miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max
-      n_miss_y <- length(miss_y)
-
-      # add the new y in the data + affect the max value of the ccdf to these y
-      # WARNING : ccdf sorted not y !
-      if (n_miss_y != 0){
-        df <- data.frame(cbind(rep(max(df$cdf),n_miss_y), 
-                               rep(max(df$ccdf),n_miss_y), 
-                               rep(df$x[1],n_miss_y), miss_y, 
-                               rep(df$Gene[1],n_miss_y)))
-        colnames(df) <- c("cdf","ccdf","x","y","Gene")
-        df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) ) 
-      } else{
-        df <- 0
-      }
-      return(df)
-    })
-
-    # Combine the data set
-    final_data <- do.call(rbind,lapply(genes, function(name){
-      # if some genes doesn't have missing y values
-      no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep)) 
-      # new dataset completed
-      df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,])
-      df$y <- as.numeric(df$y)
-      df$cdf <- as.numeric(df$cdf)
-      df$ccdf <- as.numeric(df$ccdf)
-      return(df)
-    }))
+  if(x$type == "gsa"){
+
 
-    # Thresholds ----
-    Y_after <- final_data$y
-    number_y = 20
-    # y value for each thresholds  
-    y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])), 
-                   to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y) 
+    ccdf_all <- x$ccdf
 
-    seuils <- c(0,y_after)
+    browser()
 
-    # Compute the median ----
-    med <- lapply(sev, function(x){
-      med <- rep(NA, number_y)
-      filtre_x <- final_data$x == x
-      filtre_row_i0 <-  final_data$y >= seuils[1] & final_data$y < seuils[1+1]
-      indices0 <- which(filtre_x & filtre_row_i0)
-      med[1] <- median(final_data$ccdf[indices0])
-
-      for (i in 2:length(seuils)){
+    for (i in 1:length(ccdf_all)){
+      ccdf <- ccdf_all[[i]]
+
+      genes <- names(ccdf[[1]])
+
+      # test_ccdf[[1]][[1]] # accède à Y1 Y2 names(ccdf_all[[1]][[1]])
+      # test_ccdf[[2]][[1]] # accède à Y3 Y4 names(ccdf_all[[2]][[1]])
+
+      browser()
+
+      # Create data frame with all the ccdf ----
+      data_gene <- do.call(rbind, lapply(genes, function(name) {
+        x <- ccdf[[name]]  
+        df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y)
+        colnames(df) <- c("cdf", "ccdf", "x", "y") 
+        df$Gene <- rep(name, nrow(df))
+        return(df)
+      })) 
+
+      sev <- unique(data_gene$x)
+
+
+      # Complete data if no values for some y ----
+      data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and x
+
+      new_data_gene_sep <- lapply(data_gene_sep, function(df) {
+        # max value of current y
+        max_y_cur <- max(unique(df$y)) 
+        # max value of total y
+        max_y_all <- max(unique(data_gene$y))
 
-        filtre_row_i <-  final_data$y >= seuils[i] & final_data$y < seuils[i+1]
-        indices <- which(filtre_x & filtre_row_i)
+        # all the y that are between max current y value and max total y value
+        y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data 
+        miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max
+        n_miss_y <- length(miss_y)
 
-        med[i] <- median(final_data$ccdf[indices])
+        # add the new y in the data + affect the max value of the ccdf to these y
+        # WARNING : ccdf sorted not y !
+        if (n_miss_y != 0){
+          df <- data.frame(cbind(rep(max(df$cdf),n_miss_y), 
+                                 rep(max(df$ccdf),n_miss_y), 
+                                 rep(df$x[1],n_miss_y), miss_y, 
+                                 rep(df$Gene[1],n_miss_y)))
+          colnames(df) <- c("cdf","ccdf","x","y","Gene")
+          df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) ) 
+        } else{
+          df <- 0
+        }
+        return(df)
+      })
+
+      # Combine the data set
+      final_data <- do.call(rbind,lapply(genes, function(name){
+        # if some genes doesn't have missing y values
+        no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep)) 
+        # new dataset completed
+        df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,])
+        df$y <- as.numeric(df$y)
+        df$cdf <- as.numeric(df$cdf)
+        df$ccdf <- as.numeric(df$ccdf)
+        return(df)
+      }))
+
+      # Thresholds ----
+      Y_after <- final_data$y
+      number_y = 20
+      # y value for each thresholds  
+      y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])), 
+                     to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y) 
+
+      seuils <- c(0,y_after)
+
+      # Compute the median ----
+      med <- lapply(sev, function(x){
+        med <- rep(NA, number_y)
+        filtre_x <- final_data$x == x
+        filtre_row_i0 <-  final_data$y >= seuils[1] & final_data$y < seuils[1+1]
+        indices0 <- which(filtre_x & filtre_row_i0)
+        med[1] <- median(final_data$ccdf[indices0])
 
-        ref_value <- med[i-1]
-        range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf
+        for (i in 2:length(seuils)){
+
+          filtre_row_i <-  final_data$y >= seuils[i] & final_data$y < seuils[i+1]
+          indices <- which(filtre_x & filtre_row_i)
+
+          med[i] <- median(final_data$ccdf[indices])
+
+          ref_value <- med[i-1]
+          range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf
+
+          closest_index <- min(which(range_value > ref_value))
+          closest_value <- range_value[closest_index]
+
+          if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median 
+            med[i] <- closest_value
+          }
+        }
+        med <- data.frame(med[1:number_y])
+        med$y <- y_after 
+        med$x <- x 
+        names(med)[1] <- "ccdf"
 
-        closest_index <- min(which(range_value > ref_value))
-        closest_value <- range_value[closest_index]
+        return(med)
+      })
+
+      # Replace values with the max : if some values are below the max after it 
+      replace_max <- lapply(med, function(m){
+        ind_max <- which.max(m$ccdf)
+        m[ind_max:nrow(m),]$ccdf <- max(m$ccdf)
+        return(m)
+      })
+
+      # Step function : add the ccdf to the y values that are not a thresholds
+      step_function <- lapply(replace_max, function(df){
+        # y values that are not in the data of the thresholds
+        new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y" 
+        # separate the values between the intervals of the thresholds
+        intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE) 
+        indices_int <- intervalle_indices+1 
 
-        if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median 
-          med[i] <- closest_value
-        }
-      }
-      med <- data.frame(med[1:number_y])
-      med$y <- y_after 
-      med$x <- x 
-      names(med)[1] <- "ccdf"
-
-      return(med)
-    })
-
-    # Replace values with the max : if some values are below the max after it 
-    replace_max <- lapply(med, function(m){
-      ind_max <- which.max(m$ccdf)
-      m[ind_max:nrow(m),]$ccdf <- max(m$ccdf)
-      return(m)
-    })
-
-    # Step function : add the ccdf to the y values that are not a thresholds
-    step_function <- lapply(replace_max, function(df){
-      # y values that are not in the data of the thresholds
-      new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y" 
-      # separate the values between the intervals of the thresholds
-      intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE) 
-      indices_int <- intervalle_indices+1 
-
-      new_y$ccdf <- df$ccdf[indices_int]
-      new_y$x <- unique(df$x)
-
-      return(new_y)
-    })
-
-    # Data final -----
-    data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med))
-
-    # Data modified for the legend
-    data_med$Gene <- "all"
-    data_med$Legend <- "Gene set summary"  
-
-    data_gene <- data_gene[-1]
-    data_gene$Legend <- "Genes" 
-
-
-    comb_data <- rbind(data_gene,data_med)
-
-    # Plot -----
-    ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) +
-      geom_line(aes(group = interaction(Gene,x))) + 
-      scale_size_manual(values = c(0.8,0.2)) + # modify row size according to the variable in aes(size) 
-      labs(x = "Gene expression") +
-      theme_minimal()
+        new_y$ccdf <- df$ccdf[indices_int]
+        new_y$x <- unique(df$x)
+
+        return(new_y)
+      })
+
+      # Data final -----
+      data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med))
+
+      # Data modified for the legend
+      data_med$Gene <- "all"
+      data_med$Legend <- "Gene set summary"  
+
+      data_gene <- data_gene[-1]
+      data_gene$Legend <- "Genes" 
+
+
+      comb_data <- rbind(data_gene,data_med)
+
+      # Plot -----
+      p <- ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) +
+        geom_line(aes(group = interaction(Gene,x))) + 
+        scale_size_manual(values = c(0.8,0.2)) + # modify row size according to the variable in aes(size) 
+        labs(x = "Gene expression") +
+        theme_minimal()
+
+      return(p)
+    }
+
+
   } 
 
-  # elseif (x$type == genewise)
+
+  # elseif (x$type == genewise){
+
+  #}
 
 
 }