# Rphenograph_UMAP_Script.R

## Start of script
## Analysis of high-dimensional flow-cytometry data using UMAP and kNN-graph clustering (Rphenograph)
## Generated by Ashish Sharma/Aarthi Talla

##Before starting:
# 1. Acquire data on Flow cytometer of interest
# 2. Export to flowjo/Fix compensation
# 3. Gate on live events -- To increase granularity of analyses you can specifically gate on a sub-population (i.e. Tregs, CD4s etc.)
# 4. Export gated events/sample as individual csv files - 
# 5. In the export window choose format as: CSV - scale value (Include header and choose "stain")
# 6. The exported file should have the following name format: `export_"Specimen"_"001"_(samplename)_"Well#_inHTS"_"Sample#"_"Gate".csv`
# 7. Create 2 folders within the folder with the csv files: InputFiles and OutputFiles
# 8. Make sure the Marker/Fluorophore names are the same across all the samples

# Start from an empty workspace so objects left over from a previous run
# cannot leak into this analysis.
rm(list = ls())

# Set working Directory -- Location where the csv files are
setwd("~/Desktop/INR/SCOPE/Flow/Mito Panel/UMAP_CD4")

# Every plot and table produced below is written to OutputFiles/ (relative to
# the working directory). Create the folder if it is missing so that
# pdf()/write.csv() do not fail part-way through the run; an existing folder
# is left untouched.
dir.create("OutputFiles", showWarnings = FALSE)

# Load libraries for the script
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(FNN))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(lars))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(Rphenograph))
suppressPackageStartupMessages(library(inflection))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(flowCore))
suppressPackageStartupMessages(library(Biobase))

# Import 10000 or more events per sample
# NOTE: Import the same # of events per sample -- This will reduce the sample bias during the UMAP visualization
#     If a sample does not meet the minimum, every sample is down-sampled to the size of the smallest sample (see min_events below)
# NOTE: Try to keep the maximum cumulative # of events under 100,000 -- Running tSNE on 100,000 events can take 30-50mins
# List of all csv files in working directory
# ---- Locate and import the exported csv files ----
# `pattern` is a regular expression (not a shell glob): escape the dot and
# anchor the match to the end of the file name.
fileLS <- list.files(path = ".",
                     pattern = "\\.csv$",
                     full.names = TRUE,
                     recursive = FALSE)
if (length(fileLS) == 0) {
  stop("No csv files found in ", getwd(), call. = FALSE)
}

# Fix the RNG so the random down-sampling below is reproducible
set.seed(seed = 1)
fileLS

# Read every file once; the same tables are used both to count the events
# and to draw the random sub-sample (in the same file order as before, so the
# sampled rows are unchanged for a given seed).
flowLS_temp <- lapply(fileLS, function(FILE) {
  as.data.frame(read.csv(FILE))
})

# Check what is the minimum # of cells/sample
events <- vapply(flowLS_temp, nrow, integer(1))
min_events <- min(events)

# Specifying the no. of events kept per sample: the size of the smallest
# sample, capped at 10000
min_events <- if (min_events > 10000) 10000 else min_events
min_events

# Draw `min_events` random events from every sample and tag each event with
# the donor it came from.
flowLS <- lapply(seq_along(fileLS), function(idx) {
  FILE <- fileLS[[idx]]
  # Pattern for importing DonorID: keeps the 4th underscore-separated field
  # counted from the end of the file name
  fName <- gsub(".+_(.+)_.+_.+_.+", "\\1", FILE)
  fileDF <- flowLS_temp[[idx]]
  fileDF_randomSample <- fileDF[sample.int(nrow(fileDF), min_events), ] %>%
    mutate(DonorID = fName)
  # Progress line "<DonorID>:<# of events kept>"
  print(paste(fName, nrow(fileDF_randomSample), sep = ":"))
  fileDF_randomSample
})

# Row bind the per-sample tables into one large data frame (all columns kept)
flowDF0 <- do.call(rbind, flowLS)
# The column names below are the name of the marker +/- fluorophore used in your panel
colnames(flowDF0)

# Pick the markers you want to choose for your analysis.
# I typically remove any markers used for pre-gating on a specific population
# For example: If I want to specifically look at Live CD4 T cells in a panel. I would remove parameters used to gate on CD4s - i.e. FSC, SSC, CD3, CD4 and Live/Dead
# In addition, I remove Time and DonorID -- because they don't need to be used in the analyses
# Reuse the table built above instead of row-binding all events a second time.
flowDF <- flowDF0 %>%
  select(-`FSC.A`, -`FSC.H`, -`FSC.W`, -`SSC.A`, -`SSC.H`, -`SSC.W`,
         -`CD4.PERCPCY5.5`, -`CD3.A700`, -`CD8.PB`, -`CD25.BUV395`,
         -`LD.AMCYAM`, -`CD127.BV786`,
         -`Time`, -`DonorID`)
# Copy of the marker-only table; printed so the retained markers can be checked
flowDF2 <- flowDF
colnames(flowDF2)

# Z-score every marker column (centre on the mean, divide by the standard
# deviation) before computing the embedding.
flowDF_scaled <- scale(flowDF, center = TRUE, scale = TRUE)

# Run UMAP on the scaled events using the "umap-learn" method of the umap
# package.
flowDF_scaled.umap <- umap(
  flowDF_scaled,
  method = "umap-learn",
  verbose = TRUE
)
# Embedding coordinates as a data frame (columns V1 and V2)
x1 <- as.data.frame(flowDF_scaled.umap$layout)

# Print UMAP plot
rS <- ggplot(data = x1, mapping = aes(x = V1, y = V2)) +
  geom_point(alpha = 0.5, size = 0.1)
pdf(file = "OutputFiles/UMAP_All_events.pdf")
print(rS)
dev.off()

# Finding clusters using Rphenograph
# Put the UMAP coordinates (V1, V2) in front of the full, un-filtered event
# table.
x0 <- cbind(x1, flowDF0)
colnames(x0)

# Cluster the same scaled marker matrix that was given to UMAP, using k = 30
Rphenograph_out <- Rphenograph(flowDF_scaled, k = 30)
# The second element of the result is passed to igraph's membership() to get
# one cluster id per event.
louvain_phenograph <- membership(Rphenograph_out[[2]])
# Store the cluster id as a factor alongside the event data
x0[["louvain"]] <- as.factor(louvain_phenograph)
colnames(x0)

# Print clusters on UMAP plot
# Label position for each cluster = mean UMAP coordinate of its events.
# summarise() is used directly (instead of select() + summarize_all()) so no
# "Adding missing grouping variables" message is emitted and no superseded
# scoped verb is needed; the result has the same columns: louvain, V1, V2.
lc.cent <- x0 %>%
  group_by(louvain) %>%
  summarise(V1 = mean(V1), V2 = mean(V2))

# Events coloured by cluster, with one repelled label per cluster centroid.
# The colour legend is suppressed with "none" (FALSE is deprecated in ggplot2).
rS <- ggplot(x0, aes(x = V1, y = V2, colour = louvain)) +
  geom_point(size = 0.1, alpha = 0.5) +
  theme_bw() +
  geom_label_repel(aes(label = louvain), data = lc.cent) +
  guides(colour = "none")
pdf(file = "OutputFiles/UMAP_All_events_clusters.pdf")
print(rS)
dev.off()

# Merge cluster information with original data
colnames(x0)
# Move the two UMAP coordinate columns (positions 1-2) to the end of the table
x3 <- x0[, c(3:ncol(x0), 1:2)]
colnames(x3)

# Writing and making a heatmap of the median marker MFIs per cluster
# Exclude the bookkeeping columns by name rather than by "the last 5 columns",
# so the result does not depend on Time being the last column of the csv.
non_marker_cols <- c("Time", "DonorID", "louvain", "V1", "V2")
marker_cols <- setdiff(colnames(x3), non_marker_cols)
new_x0 <- aggregate(x3[, marker_cols, drop = FALSE], list(x3$louvain), median)
# First column returned by aggregate() is the cluster id: use it as row name
rownames(new_x0) <- new_x0[, 1]
new_x0 <- new_x0[, -1, drop = FALSE]
write.csv(new_x0, 'OutputFiles/MFI_per_cluster.csv')
# Drop the pre-gating parameters so the heatmap only shows the analysed markers
new_x0 <- new_x0 %>%
  select(-`FSC.A`, -`FSC.H`, -`FSC.W`, -`SSC.A`, -`SSC.H`, -`SSC.W`,
         -`CD4.PERCPCY5.5`, -`CD3.A700`, -`CD8.PB`, -`CD25.BUV395`,
         -`LD.AMCYAM`, -`CD127.BV786`)
# Transpose: markers become rows, clusters become columns
t_new_x0 <- t(new_x0) %>% as.data.frame(.)
colnames(t_new_x0)

# Argument list for pheatmap(): the first (unnamed) element is the table to
# draw (markers x clusters); every other element is passed by name.
hm.parameters <- list(
  t_new_x0,
  scale = "row",                        # scale each marker (row) separately
  cellwidth = 10,
  cellheight = 10,
  color = colorRampPalette(
    c("blue4", "blue1", "white", "red1", "brown1")
  )(100),                               # 100-step blue -> white -> red ramp
  kmeans_k = NA,
  show_rownames = TRUE,
  show_colnames = TRUE,
  fontsize = 8,
  main = "",
  clustering_method = "ward.D2",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean"
)

# Draw the heatmap into a pdf file
pdf(file = "OutputFiles/MFI_per_cluster.pdf")
kmean.hm <- do.call("pheatmap", hm.parameters)
print(kmean.hm)
dev.off()


## Bi-exp transform and make heatmap per cluster
# Marker-only event table as a matrix, with columns as markers and rows as events
fcsData <- as.matrix(flowDF)

# Per-marker annotation used as the `parameters` slot of the flowFrame
metaData <- data.frame(
  name = colnames(fcsData),
  desc = paste('this is column', colnames(fcsData))
)

# Per-marker span (max - min), minimum and maximum
metaData$range <- apply(fcsData, 2, function(marker) diff(range(marker)))
metaData$minRange <- apply(fcsData, 2, min)
metaData$maxRange <- apply(fcsData, 2, max)

# Wrap the matrix and its annotation in a flowFrame
ff <- new(
  "flowFrame",
  exprs = fcsData,
  parameters = AnnotatedDataFrame(metaData)
)

### Transform MFI by 'biexponential transformation'
biexp <- biexponentialTransform(
  "BETransform",
  a = 0.5, b = 1, c = 0.5, d = 1, f = 0, w = 0,
  tol = .Machine$double.eps^0.25,
  maxit = as.integer(5000)
)
# Apply the same transform to every column of the flowFrame
BETnsemat <- transform(ff, transformList(colnames(exprs(ff)), biexp))

# Fetch transformed expression and append the cluster id as the last column
# (the unnamed vector ends up in a column called "x3$louvain")
fcsDataBET <- as.data.frame(exprs(BETnsemat))
fcsDataBET_new <- cbind(fcsDataBET, x3$louvain)

# Median transformed expression per cluster, computed on every column except
# the trailing cluster-id column
new_fcsDataBET <- aggregate(
  x = fcsDataBET_new[, -ncol(fcsDataBET_new)],
  by = list(fcsDataBET_new[["x3$louvain"]]),
  FUN = median
)
# First column returned by aggregate() is the cluster id: use it as row name
rownames(new_fcsDataBET) <- new_fcsDataBET[, 1]
new_fcsDataBET <- new_fcsDataBET[, -1]
# Transpose: markers become rows, clusters become columns
t_new_fcsDataBET <- as.data.frame(t(new_fcsDataBET))
colnames(t_new_fcsDataBET)

# Argument list for pheatmap(): the first (unnamed) element is the table of
# bi-exp transformed medians (markers x clusters); every other element is
# passed by name.
hm.parameters <- list(
  t_new_fcsDataBET,
  scale = "row",                        # scale each marker (row) separately
  cellwidth = 10,
  cellheight = 10,
  color = colorRampPalette(
    c("blue4", "blue1", "white", "red1", "brown1")
  )(100),                               # 100-step blue -> white -> red ramp
  kmeans_k = NA,
  show_rownames = TRUE,
  show_colnames = TRUE,
  fontsize = 8,
  main = "",
  clustering_method = "ward.D2",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean"
)

# Draw the heatmap into a pdf file
pdf(file = "OutputFiles/MFI_per_cluster_Biexp.pdf")
kmean.hm <- do.call("pheatmap", hm.parameters)
print(kmean.hm)
dev.off()


#Add annotations
#Prior to starting this step -- create a Tab-delimited file in the input files folder where:
#the "DonorID" is in the first column (Don't add a label to the first column; match the DonorIDs generated in this script)
#Add sample characteristics (e.g. clinical outcomes) to Subsequent columns (columns 2+; label these columns)
#If you wish to visualize these characteristics -- make them numeric

# Per-event table (original columns, DonorID, cluster id and UMAP
# coordinates); per the file name it is intended for use in FlowJo
write.csv(x3, file = "OutputFiles/KNN_clutering_for_FlowJo.csv")
colnames(x3)

# Make cluster frequency table: rows = DonorID, columns = cluster,
# cells = # of events
x6 <- as.matrix(table(x3[["DonorID"]], x3[["louvain"]]))
write.csv(x6, file = "OutputFiles/cluster_table_louvain.csv")