Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
sara-fallet committed Jun 26, 2024
1 parent 043146a commit fdf175d
Showing 1 changed file with 154 additions and 134 deletions.
288 changes: 154 additions & 134 deletions R/plot_ccdf_cit_gsa.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

#' Function for plotting the CDF of all the genes within a gene set
#' Function for plotting the CDF of all the genes within a gene set or for one gene
#'
#' @param x an object of class \code{\link{cit_gsa}}
#'
Expand All @@ -17,149 +17,169 @@

plot.citcdf <- function(x, ...){

if(x$type =="gsa"){

ccdf <- x$ccdf

genes <- names(ccdf)

# Create data frame with all the ccdf ----
data_gene <- do.call(rbind, lapply(genes, function(name) {
x <- ccdf[[name]]
df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y)
colnames(df) <- c("cdf", "ccdf", "x", "y")
df$Gene <- rep(name, nrow(df))
return(df)
}))

sev <- unique(data_gene$x)

# Complete data if no values for some y ----
data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and x

new_data_gene_sep <- lapply(data_gene_sep, function(df) {
# max value of current y
max_y_cur <- max(unique(df$y))
# max value of total y
max_y_all <- max(unique(data_gene$y))

# all the y that are between max current y value and max total y value
y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data
miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max
n_miss_y <- length(miss_y)

# add the new y in the data + affect the max value of the ccdf to these y
# WARNING : ccdf sorted not y !
if (n_miss_y != 0){
df <- data.frame(cbind(rep(max(df$cdf),n_miss_y),
rep(max(df$ccdf),n_miss_y),
rep(df$x[1],n_miss_y), miss_y,
rep(df$Gene[1],n_miss_y)))
colnames(df) <- c("cdf","ccdf","x","y","Gene")
df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) )
} else{
df <- 0
}
return(df)
})

# Combine the data set
final_data <- do.call(rbind,lapply(genes, function(name){
# if some genes doesn't have missing y values
no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep))
# new dataset completed
df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,])
df$y <- as.numeric(df$y)
df$cdf <- as.numeric(df$cdf)
df$ccdf <- as.numeric(df$ccdf)
return(df)
}))
if(x$type == "gsa"){


# Thresholds ----
Y_after <- final_data$y
number_y = 20
# y value for each thresholds
y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])),
to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y)
ccdf_all <- x$ccdf

seuils <- c(0,y_after)
browser()

# Compute the median ----
med <- lapply(sev, function(x){
med <- rep(NA, number_y)
filtre_x <- final_data$x == x
filtre_row_i0 <- final_data$y >= seuils[1] & final_data$y < seuils[1+1]
indices0 <- which(filtre_x & filtre_row_i0)
med[1] <- median(final_data$ccdf[indices0])

for (i in 2:length(seuils)){
for (i in 1:length(ccdf_all)){
ccdf <- ccdf_all[[i]]

genes <- names(ccdf[[1]])

# test_ccdf[[1]][[1]] # accède à Y1 Y2 names(ccdf_all[[1]][[1]])
# test_ccdf[[2]][[1]] # accède à Y3 Y4 names(ccdf_all[[2]][[1]])

browser()

# Create data frame with all the ccdf ----
data_gene <- do.call(rbind, lapply(genes, function(name) {
x <- ccdf[[name]]
df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y)
colnames(df) <- c("cdf", "ccdf", "x", "y")
df$Gene <- rep(name, nrow(df))
return(df)
}))

sev <- unique(data_gene$x)


# Complete data if no values for some y ----
data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and x

new_data_gene_sep <- lapply(data_gene_sep, function(df) {
# max value of current y
max_y_cur <- max(unique(df$y))
# max value of total y
max_y_all <- max(unique(data_gene$y))

filtre_row_i <- final_data$y >= seuils[i] & final_data$y < seuils[i+1]
indices <- which(filtre_x & filtre_row_i)
# all the y that are between max current y value and max total y value
y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data
miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max
n_miss_y <- length(miss_y)

med[i] <- median(final_data$ccdf[indices])
# add the new y in the data + affect the max value of the ccdf to these y
# WARNING : ccdf sorted not y !
if (n_miss_y != 0){
df <- data.frame(cbind(rep(max(df$cdf),n_miss_y),
rep(max(df$ccdf),n_miss_y),
rep(df$x[1],n_miss_y), miss_y,
rep(df$Gene[1],n_miss_y)))
colnames(df) <- c("cdf","ccdf","x","y","Gene")
df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) )
} else{
df <- 0
}
return(df)
})

# Combine the data set
final_data <- do.call(rbind,lapply(genes, function(name){
# if some genes doesn't have missing y values
no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep))
# new dataset completed
df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,])
df$y <- as.numeric(df$y)
df$cdf <- as.numeric(df$cdf)
df$ccdf <- as.numeric(df$ccdf)
return(df)
}))

# Thresholds ----
Y_after <- final_data$y
number_y = 20
# y value for each thresholds
y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])),
to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y)

seuils <- c(0,y_after)

# Compute the median ----
med <- lapply(sev, function(x){
med <- rep(NA, number_y)
filtre_x <- final_data$x == x
filtre_row_i0 <- final_data$y >= seuils[1] & final_data$y < seuils[1+1]
indices0 <- which(filtre_x & filtre_row_i0)
med[1] <- median(final_data$ccdf[indices0])

ref_value <- med[i-1]
range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf
for (i in 2:length(seuils)){

filtre_row_i <- final_data$y >= seuils[i] & final_data$y < seuils[i+1]
indices <- which(filtre_x & filtre_row_i)

med[i] <- median(final_data$ccdf[indices])

ref_value <- med[i-1]
range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf

closest_index <- min(which(range_value > ref_value))
closest_value <- range_value[closest_index]

if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median
med[i] <- closest_value
}
}
med <- data.frame(med[1:number_y])
med$y <- y_after
med$x <- x
names(med)[1] <- "ccdf"

closest_index <- min(which(range_value > ref_value))
closest_value <- range_value[closest_index]
return(med)
})

# Replace values with the max : if some values are below the max after it
replace_max <- lapply(med, function(m){
ind_max <- which.max(m$ccdf)
m[ind_max:nrow(m),]$ccdf <- max(m$ccdf)
return(m)
})

# Step function : add the ccdf to the y values that are not a thresholds
step_function <- lapply(replace_max, function(df){
# y values that are not in the data of the thresholds
new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y"
# separate the values between the intervals of the thresholds
intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE)
indices_int <- intervalle_indices+1

if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median
med[i] <- closest_value
}
}
med <- data.frame(med[1:number_y])
med$y <- y_after
med$x <- x
names(med)[1] <- "ccdf"

return(med)
})

# Replace values with the max : if some values are below the max after it
replace_max <- lapply(med, function(m){
ind_max <- which.max(m$ccdf)
m[ind_max:nrow(m),]$ccdf <- max(m$ccdf)
return(m)
})

# Step function : add the ccdf to the y values that are not a thresholds
step_function <- lapply(replace_max, function(df){
# y values that are not in the data of the thresholds
new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y"
# separate the values between the intervals of the thresholds
intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE)
indices_int <- intervalle_indices+1

new_y$ccdf <- df$ccdf[indices_int]
new_y$x <- unique(df$x)

return(new_y)
})

# Data final -----
data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med))

# Data modified for the legend
data_med$Gene <- "all"
data_med$Legend <- "Gene set summary"

data_gene <- data_gene[-1]
data_gene$Legend <- "Genes"


comb_data <- rbind(data_gene,data_med)

# Plot -----
ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) +
geom_line(aes(group = interaction(Gene,x))) +
scale_size_manual(values = c(0.8,0.2)) + # modify row size according to the variable in aes(size)
labs(x = "Gene expression") +
theme_minimal()
new_y$ccdf <- df$ccdf[indices_int]
new_y$x <- unique(df$x)

return(new_y)
})

# Data final -----
data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med))

# Data modified for the legend
data_med$Gene <- "all"
data_med$Legend <- "Gene set summary"

data_gene <- data_gene[-1]
data_gene$Legend <- "Genes"


comb_data <- rbind(data_gene,data_med)

# Plot -----
p <- ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) +
geom_line(aes(group = interaction(Gene,x))) +
scale_size_manual(values = c(0.8,0.2)) + # modify row size according to the variable in aes(size)
labs(x = "Gene expression") +
theme_minimal()

return(p)
}


}

# elseif (x$type == genewise)

# elseif (x$type == genewise){

#}


}
Expand Down

0 comments on commit fdf175d

Please sign in to comment.