# Karman_et_al_manuscript_code_final.R

######################Figures 1 and 2., Supplementary Figures 1 and 5.#####################
# Append two user-specific package libraries (R 3.4 and 3.5 trees) to the library search path.
.libPaths(c(.libPaths(), "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.4", "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.5"))
# Keep strings as character vectors when data frames are read or built.
options(stringsAsFactors = F)
# Relative file names used below (CSV inputs, xlsx outputs, consensus directory) resolve against this directory.
setwd('/sc/wo/tri_data/jk/2018_IPF_publicdata_JK/')
# Attach every package named in the vector. The value of sapply() is not assigned, so it is printed at top level.
# Packages called with `::` further down (diceR, plyr) are not attached here and only need to be installed.
sapply(c('rcompanion', 'ggplot2', 'genefilter', 'WGCNA', 'reshape2', 'pheatmap', 'RColorBrewer', 'limma', 'edgeR', 'cowplot',
         'ConsensusClusterPlus', 'xlsx', 'GSVA', 'clusterProfiler', 'DOSE', 'GO.db', 'org.Hs.eg.db', 'GSEABase', 'Pigengene',
         'ReactomePA', 'magrittr', 'FactoMineR', 'factoextra', 'FSA', 'circlize', 'tibble', 'dplyr', 'corrplot', 'ggcorrplot'), library, character.only=T)

#consensus clustering, PCA and limma
# Sample annotation table; row names are the sample identifiers used for all merges below.
gse47460_pheno <- read.csv('gse47460_pheno_clean.csv', header = TRUE, row.names = 1) # contains all samples in GSE47460
# Expression table with genes in rows and samples in columns.
gse47460_matrix <- read.csv('gse47460_as_matrix_platformremoved.csv', header = TRUE, row.names = 1)
# matrix generated by reading raw data from GEO by ArrayStudio, followed by quantile normalization and batch effect removal (see steps below)
# GSE47460 features two platforms, GPL6480 and GPL14550
# data from two platforms was processed separately from author-provided raw data using ArrayStudio
# using normalized data, probes were collapsed to genes for both sample sets using median of probes
# data from two platforms was merged based on common set of genes
# batch effect of platform removed by ArrayStudio 'Remove batch effect' feature, resulting in matrix above

set.seed(1)
nicecolors <- c("#999999", "#E69F00", "#56B4E9")
# Reduce every column name to the text before its first underscore so that the
# columns can be matched to the row names of the annotation table.
# Done in one vectorised pass instead of growing a vector inside a 1:ncol() loop.
gsm <- vapply(strsplit(colnames(gse47460_matrix), split = '_', fixed = TRUE),
              function(parts) parts[1], character(1))
colnames(gse47460_matrix) <- gsm
# One row per sample: annotation columns followed by one column per gene.
# merge() already returns a data frame, so no extra conversion is needed.
gse47460_phenom <- merge(gse47460_pheno, t(gse47460_matrix), by.x = 0, by.y = 0)
rownames(gse47460_phenom) <- gse47460_phenom$Row.names
gse47460_phenom$Row.names <- NULL
# Consensus clustering of the samples annotated as subtype '2-UIP/IPF'.
set.seed(1)
ipf_sample_ids <- row.names(subset(gse47460_pheno, subtype == '2-UIP/IPF'))
d_gse47460 <- gse47460_matrix[, ipf_sample_ids] # n=160 samples
# Median absolute deviation of every gene across the selected samples.
mads_gse47460 <- apply(d_gse47460, 1, mad)
# Keep the 5000 rows that come first when genes are ranked by decreasing MAD.
top_mad_rows <- rev(order(mads_gse47460))[1:5000]
d_gse47460 <- d_gse47460[top_mad_rows, ]
# Centre every gene on its own median (sweep() subtracts by default).
gene_medians <- apply(d_gse47460, 1, median, na.rm = TRUE)
d_gse47460 <- as.matrix(sweep(d_gse47460, 1, gene_medians))
dir.create('gse47460_consensus_081420')
title <- 'gse47460_consensus_081420'
# NOTE(review): `title` and the directory created above are not passed to
# ConsensusClusterPlus() in this call - confirm whether title/plot arguments were intended.
results_gse47460 <- ConsensusClusterPlus(d_gse47460, maxK = 6, reps = 1000, pItem = 0.8, pFeature = 1,
                                         clusterAlg = "hc", distance = "pearson", seed = 1234) # Figure 1A

# Item- and cluster-consensus summaries for the clustering result.
resultsicl <- calcICL(results_gse47460)
# Consensus matrix for every k from 2 to 6, named k2..k6.
cluster_counts <- seq(2, 6)
consensusmatrices <- lapply(cluster_counts, function(k) results_gse47460[[k]]$consensusMatrix)
names(consensusmatrices) <- paste0('k', cluster_counts)
# PAC of each consensus matrix, evaluated between consensus values 0.1 and 0.9.
pac <- lapply(consensusmatrices, function(consensus) diceR::PAC(consensus, lower = 0.1, upper = 0.9))
# One row per k (row names k2..k6) with its PAC value and the cluster count.
pacframe <- data.frame(PAC = unlist(pac), number_of_clusters = cluster_counts)
#Supplementary Figure 1.
ggplot(pacframe, aes(x = number_of_clusters, y = PAC)) +
  geom_point(shape = 18, color = "blue", size = 5) +
  geom_line() +
  scale_y_continuous(limits = c(0.25, 0.35))

# Class assignment at k = 2 for every clustered sample, relabelled Cluster_1 / Cluster_2.
k2_classes <- results_gse47460[[2]]$consensusClass
consensusdf_gse47460 <- data.frame(consensusclass = paste0('Cluster_', k2_classes),
                                   row.names = names(k2_classes),
                                   stringsAsFactors = FALSE)
# Samples with Phenotype 'Control' get the value of the first annotation column as their class label.
healthydf_gse47460 <- subset(gse47460_pheno, Phenotype == 'Control')
healthydf2_gse47460 <- data.frame(consensusclass = healthydf_gse47460[, 1],
                                  row.names = row.names(healthydf_gse47460),
                                  stringsAsFactors = FALSE)
# Stack clustered samples and controls into one single-column table.
consensusdf2_gse47460 <- rbind(consensusdf_gse47460, healthydf2_gse47460)
#consensusdf2_gse47460$geo <- as.data.frame(unlist(strsplit(row.names(consensusdf2_gse47460), split = '_', fixed = T)))[seq(1, 804, by=3),]
# Attach annotation and expression by sample ID. merge() keeps only samples present in both
# tables; the class label becomes the first column of the result.
gse47460_phenom_consensus <- as.data.frame(merge(consensusdf2_gse47460, gse47460_phenom, by.x = 0, by.y = 0))
rownames(gse47460_phenom_consensus) <- gse47460_phenom_consensus$Row.names
gse47460_phenom_consensus$Row.names <- NULL

#heatmap in Figure 1B generated using ArrayStudio

#Figure 1C:
# PCA on the merged table; the first 18 columns are declared qualitative supplementary variables.
# NOTE(review): the vs-healthy model further down treats columns 15:19010 of the same table as
# expression values - confirm which column range really holds the annotation columns.
gse47460_pca <- PCA(gse47460_phenom_consensus, quali.sup = 1:18, graph = FALSE)
fviz_pca_ind(gse47460_pca, habillage = "consensusclass", addEllipses = TRUE, geom = 'point') # Figure 1C

# narrowed analysis to patients with smoking information available so it can be included in linear model
gse47460_phenom_consensus2 <- gse47460_phenom_consensus[which(gse47460_phenom_consensus$smoker != 'no_value'), ]

# Cluster_1 vs Cluster_2 differential expression, adjusted for smoking status and sex.
# The design rows and the expression columns are both taken from this one subset, so their
# order cannot drift apart (previously the columns were re-selected through a separate
# Phenotype filter and `smokervalues` was computed but never used).
cluster_samples_gse47460 <- subset(gse47460_phenom_consensus2, consensusclass %in% c("Cluster_1", "Cluster_2"))
smokervalues <- row.names(cluster_samples_gse47460)
fl_gse47460 <- as.factor(cluster_samples_gse47460$consensusclass)
sm_gse47460 <- as.factor(cluster_samples_gse47460$smoker)
g_gse47460 <- as.factor(cluster_samples_gse47460$Sex)
# No intercept: one coefficient per cluster level, so the contrast below can name both clusters.
design_gse47460 <- model.matrix(~0+fl_gse47460+sm_gse47460+g_gse47460)
gse47460_matrix2 <- gse47460_matrix[, smokervalues]
fit_gse47460 <- lmFit(gse47460_matrix2, design_gse47460)
cont.matrix_gse47460 <- makeContrasts(fl_gse47460Cluster_1-fl_gse47460Cluster_2, levels=design_gse47460)
fit2_gse47460 <- contrasts.fit(fit_gse47460, cont.matrix_gse47460)
fit2_gse47460 <- eBayes(fit2_gse47460)
# Full table (all genes), FDR-adjusted, sorted by B statistic.
tT_gse47460 <- topTable(fit2_gse47460, adjust="fdr", sort.by="B", number=nrow(gse47460_matrix))
tT_gse47460$gene <- row.names(tT_gse47460)
# Significant genes: adjusted p < 0.05 and |logFC| > 0.58.
tT_gse47460_sig <- subset(tT_gse47460, adj.P.Val<0.05 & abs(logFC)>0.58)

# Covariates for every sample with smoking information (both clusters and controls).
# The variable names are part of the design column names used in the contrasts below.
fl_gse47460_vshealthy <- as.factor(gse47460_phenom_consensus2[["consensusclass"]])
sm_gse47460_vshealthy <- as.factor(gse47460_phenom_consensus2[["smoker"]])
g_gse47460_vshealthy <- as.factor(gse47460_phenom_consensus2[["Sex"]])

design_gse47460_vshealthy <- model.matrix(~0+fl_gse47460_vshealthy+sm_gse47460_vshealthy+g_gse47460_vshealthy)
# Expression values are taken from columns 15:19010 of the merged table and transposed back to genes x samples.
# NOTE(review): the PCA call on the same table declares columns 1:18 as qualitative - confirm that
# 15:19010 covers exactly the gene columns.
expr_gse47460_vshealthy <- t(gse47460_phenom_consensus2[, 15:19010])
fit_gse47460_vshealthy <- lmFit(expr_gse47460_vshealthy, design_gse47460_vshealthy)
# Both cluster-vs-control contrasts together, then each one on its own.
cont.matrix_gse47460_vshealthy <- makeContrasts(fl_gse47460_vshealthyCluster_1-fl_gse47460_vshealthyControl,
                                                fl_gse47460_vshealthyCluster_2-fl_gse47460_vshealthyControl,
                                                levels = design_gse47460_vshealthy)
cont.matrix_gse47460_1vshealthy <- makeContrasts(fl_gse47460_vshealthyCluster_1-fl_gse47460_vshealthyControl,
                                                 levels = design_gse47460_vshealthy)
cont.matrix_gse47460_2vshealthy <- makeContrasts(fl_gse47460_vshealthyCluster_2-fl_gse47460_vshealthyControl,
                                                 levels = design_gse47460_vshealthy)

# Two-contrast fit: the first two columns of the table are the per-contrast estimates and are
# renamed subset1 / subset2; adj.P.Val in this table belongs to the joint test of both contrasts.
fit2_gse47460_vshealthy <- eBayes(contrasts.fit(fit_gse47460_vshealthy, cont.matrix_gse47460_vshealthy))
n_genes_gse47460 <- nrow(gse47460_matrix)
tT_gse47460_vshealthy <- topTable(fit2_gse47460_vshealthy, adjust = "fdr", sort.by = "B", number = n_genes_gse47460)
tT_gse47460_vshealthy$gene <- row.names(tT_gse47460_vshealthy)
colnames(tT_gse47460_vshealthy)[1:2] <- c('subset1', 'subset2')
tT_gse47460_vshealthy_sig1 <- subset(tT_gse47460_vshealthy, abs(subset1) > 0.58 & adj.P.Val < 0.05)
tT_gse47460_vshealthy_sig2 <- subset(tT_gse47460_vshealthy, abs(subset2) > 0.58 & adj.P.Val < 0.05)

# Cluster_1 vs control: moderated statistics, full table, significant genes sorted by decreasing logFC.
fit2_gse47460_1vshealthy <- eBayes(contrasts.fit(fit_gse47460_vshealthy, cont.matrix_gse47460_1vshealthy))
tT_gse47460_1vshealthy <- topTable(fit2_gse47460_1vshealthy, adjust = "fdr", sort.by = "B",
                                   number = nrow(gse47460_matrix))
tT_gse47460_1vshealthy$gene <- row.names(tT_gse47460_1vshealthy)
#colnames(tT_gse47460_1vshealthy)[1:2] <- c('subset1', 'subset2')
tT_gse47460_1vshealthy_sig1 <- subset(tT_gse47460_1vshealthy, abs(logFC) > 0.58 & adj.P.Val < 0.05)
rank_1vshealthy <- order(tT_gse47460_1vshealthy_sig1$logFC, decreasing = TRUE)
tT_gse47460_1vshealthy_sig1 <- tT_gse47460_1vshealthy_sig1[rank_1vshealthy, ]
write.xlsx(tT_gse47460_1vshealthy_sig1, file = 'tT_gse47460_1vshealthy_sig1_081720.xlsx')

# Cluster_2 vs control: same steps.
fit2_gse47460_2vshealthy <- eBayes(contrasts.fit(fit_gse47460_vshealthy, cont.matrix_gse47460_2vshealthy))
tT_gse47460_2vshealthy <- topTable(fit2_gse47460_2vshealthy, adjust = "fdr", sort.by = "B",
                                   number = nrow(gse47460_matrix))
tT_gse47460_2vshealthy$gene <- row.names(tT_gse47460_2vshealthy)
#colnames(tT_gse47460_2vshealthy)[1:2] <- c('subset1', 'subset2')
tT_gse47460_2vshealthy_sig1 <- subset(tT_gse47460_2vshealthy, abs(logFC) > 0.58 & adj.P.Val < 0.05)
rank_2vshealthy <- order(tT_gse47460_2vshealthy_sig1$logFC, decreasing = TRUE)
tT_gse47460_2vshealthy_sig1 <- tT_gse47460_2vshealthy_sig1[rank_2vshealthy, ]
write.xlsx(tT_gse47460_2vshealthy_sig1, file = 'tT_gse47460_2vshealthy_sig1_081720.xlsx')

# Genes significant in both comparisons (suffix .x = Cluster_1, .y = Cluster_2) and the
# correlation of their fold changes.
tT_gse47460_1vshealthy_sig_2vshealthy_sig <- merge(tT_gse47460_1vshealthy_sig1, tT_gse47460_2vshealthy_sig1,
                                                   by = 'gene')
cor.test(tT_gse47460_1vshealthy_sig_2vshealthy_sig$logFC.x, tT_gse47460_1vshealthy_sig_2vshealthy_sig$logFC.y)

###############Pathway enrichment tables###################
#Table 2A and 2B are generated in IPA

# Significant Cluster_1-vs-control genes, mapped from symbol to Entrez ID.
gene1list <- row.names(tT_gse47460_1vshealthy_sig1)
gene1entrez <- bitr(gene1list, fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Hs.eg.db)
gene1listentrez <- merge(gene1entrez, tT_gse47460_1vshealthy_sig1, by.x = 'SYMBOL', by.y = 'gene')
gene1listentrez <- gene1listentrez[order(gene1listentrez$logFC, decreasing = TRUE), ]
# Ranked input for GSEA: third column of the merged table (logFC, after SYMBOL and ENTREZID), named by Entrez ID.
gene1list_vector <- setNames(gene1listentrez[, 3], gene1listentrez$ENTREZID)

# The three permutation-based enrichment runs stay in their original order (GO BP, KEGG, Reactome).
gene1list_go <- gseGO(geneList = gene1list_vector, OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000,
                      minGSSize = 15, maxGSSize = 500, pvalueCutoff = 0.2, verbose = TRUE)
gene1list_kegg <- gseKEGG(geneList = gene1list_vector, organism = 'hsa', nPerm = 1000, minGSSize = 15,
                          pvalueCutoff = 0.2, verbose = TRUE)
gene1list_goresults <- slot(gene1list_go, "result")
gene1list_keggresults <- slot(gene1list_kegg, "result")

gene1list_reactome <- gsePathway(gene1list_vector, nPerm = 10000, minGSSize = 15,
                                 pvalueCutoff = 0.2, pAdjustMethod = "BH", verbose = TRUE)
gene1list_reactome_results <- slot(gene1list_reactome, "result")
#Table 2C
# Reactome results by decreasing NES; only positively enriched pathways are exported.
nes_rank1 <- order(gene1list_reactome_results$NES, decreasing = TRUE)
gene1list_reactome_results <- gene1list_reactome_results[nes_rank1, ]
gene1list_reactome_results_up <- gene1list_reactome_results[which(gene1list_reactome_results$NES > 0), ]
write.xlsx(gene1list_reactome_results_up, file = 'gse47460_subset1vshealthy_reactome_results_up_081720.xlsx')

# Significant Cluster_2-vs-control genes, mapped from symbol to Entrez ID.
gene2list <- row.names(tT_gse47460_2vshealthy_sig1)
gene2entrez <- bitr(gene2list, fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Hs.eg.db)
gene2listentrez <- merge(gene2entrez, tT_gse47460_2vshealthy_sig1, by.x = 'SYMBOL', by.y = 'gene')
gene2listentrez <- gene2listentrez[order(gene2listentrez$logFC, decreasing = TRUE), ]
# Ranked input for GSEA: third column of the merged table (logFC, after SYMBOL and ENTREZID), named by Entrez ID.
gene2list_vector <- setNames(gene2listentrez[, 3], gene2listentrez$ENTREZID)

# The three permutation-based enrichment runs stay in their original order (GO BP, KEGG, Reactome).
gene2list_go <- gseGO(geneList = gene2list_vector, OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000,
                      minGSSize = 15, maxGSSize = 500, pvalueCutoff = 0.2, verbose = TRUE)
gene2list_kegg <- gseKEGG(geneList = gene2list_vector, organism = 'hsa', nPerm = 1000, minGSSize = 15,
                          pvalueCutoff = 0.2, verbose = TRUE)
gene2list_goresults <- slot(gene2list_go, "result")
gene2list_keggresults <- slot(gene2list_kegg, "result")
gene2list_reactome <- gsePathway(gene2list_vector, nPerm = 10000, minGSSize = 15,
                                 pvalueCutoff = 0.2, pAdjustMethod = "BH", verbose = TRUE)
gene2list_reactome_results <- slot(gene2list_reactome, "result")
nes_rank2 <- order(gene2list_reactome_results$NES, decreasing = TRUE)
gene2list_reactome_results <- gene2list_reactome_results[nes_rank2, ]
#Table 2D
# Only positively enriched Reactome pathways are exported.
gene2list_reactome_results_up <- gene2list_reactome_results[which(gene2list_reactome_results$NES > 0), ]
write.xlsx(gene2list_reactome_results_up, file = 'gse47460_subset2vshealthy_reactome_results_up_081720.xlsx')
# Enrichment map for subset 2, then pathway view and running-score plot of
# "Extracellular matrix organization" (R-HSA-1474244) for each subset.
emapplot(gene2list_reactome)
viewPathway("Extracellular matrix organization", readable = TRUE, foldChange = gene1list_vector)
gseaplot(gene1list_reactome, geneSetID = "R-HSA-1474244")
viewPathway("Extracellular matrix organization", readable = TRUE, foldChange = gene2list_vector)
gseaplot(gene2list_reactome, geneSetID = "R-HSA-1474244")

#Figures 1D and 2A, Supplementary Figures 2 and 4: same code below repeated for each Figure, with appropriate 'genes' vector used
genes <- c('RPGRIP1', 'DNAH6', 'DNAH7', 'DNAI1', 'MUC5B')

# One boxplot (with jittered points) of expression by consensus class per gene.
# The plots are built straight into a named list instead of being assigned to temporary
# global 'Figure1_*' objects and collected with mget()/rm(); this also removes the warning
# that rm(figure1_plots) raised when the object did not exist yet.
figure1_plots <- lapply(unique(genes), function(gene) {
  ggplot(gse47460_phenom_consensus, aes_string('consensusclass', gene, fill = 'consensusclass')) +
    geom_boxplot(width = 0.5, show.legend = FALSE) +
    geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
    theme_bw() +
    scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
    theme(legend.position = "none")
})
names(figure1_plots) <- paste0('Figure1_', unique(genes))
# Keep the panel order of the previous implementation, which collected the plots via ls()
# and therefore arranged them by sorted object name.
figure1_plots <- figure1_plots[sort(names(figure1_plots))]
do.call(plot_grid, figure1_plots)

#Figure 1E:
# Differential expression list read from 'gse32537_reported_difflist.csv'; several rows may share one gene.
gse32537_reported <- read.csv('gse32537_reported_difflist.csv', header = TRUE)
gse32537_reported_genes <- unique(gse32537_reported$Gene)
# logFC values grouped per gene, in order of first appearance.
gse32537_reported_bygene <- setNames(
  lapply(gse32537_reported_genes, function(gene) subset(gse32537_reported, Gene == gene)$logFC),
  gse32537_reported_genes
)
# Per-gene mean logFC with the sign flipped.
gse32537_reported_geneavs <- lapply(gse32537_reported_bygene, function(values) -mean(values))
gse32537_reported2 <- as.data.frame(unlist(gse32537_reported_geneavs))
# Join with the Cluster_1 vs Cluster_2 table on gene name; after the merge column 2 is the
# GSE47460 logFC and column 8 the averaged GSE32537 value.
tT_gse47460_tT_gse32537_reported2 <- merge(tT_gse47460, gse32537_reported2, by.x = 'gene', by.y = 0)
rownames(tT_gse47460_tT_gse32537_reported2) <- tT_gse47460_tT_gse32537_reported2$gene
colnames(tT_gse47460_tT_gse32537_reported2)[c(2, 8)] <- c('logFC_GSE47460', 'logFC_GSE32537')
#Figure 1F below:
# NOTE(review): this section is headed "Figure 1E" but the plot is labelled "Figure 1F" - confirm the panel letter.
ggplot(tT_gse47460_tT_gse32537_reported2, aes(x = logFC_GSE47460, y = logFC_GSE32537)) +
  geom_point(shape = 18, color = "blue") +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "blue")
cor.test(tT_gse47460_tT_gse32537_reported2$logFC_GSE47460, tT_gse47460_tT_gse32537_reported2$logFC_GSE32537)

#Figure 2B:
# Fill colours for the three consensus classes, defined before the first plot that uses them.
nicecolors <- c("#999999", "#E69F00", "#56B4E9")

# DLCO: drop samples without a value, convert to numeric, compare classes and plot.
gse47460_phenom_consensus_dlco <- subset(gse47460_phenom_consensus, dlco!='no_value')
gse47460_phenom_consensus_dlco$dlco <- as.numeric(gse47460_phenom_consensus_dlco$dlco)
dunnTest(dlco~consensusclass,
         data = gse47460_phenom_consensus_dlco)
TukeyHSD(aov(dlco~consensusclass, data = gse47460_phenom_consensus_dlco))
gg_dlco <- ggplot(gse47460_phenom_consensus_dlco, aes(consensusclass, dlco, fill = consensusclass)) +
  geom_boxplot(width=0.5, show.legend = FALSE) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() +
  scale_fill_manual(values=nicecolors) + theme(legend.position = "none")

# FVC (fvc_pre). The plot object is now named gg_fvc; it was previously stored as gg_fev.
gse47460_phenom_consensus_fvc_pre <- subset(gse47460_phenom_consensus, fvc_pre!='no_value')
gse47460_phenom_consensus_fvc_pre$fvc_pre <- as.numeric(gse47460_phenom_consensus_fvc_pre$fvc_pre)
dunnTest(fvc_pre~consensusclass, data = gse47460_phenom_consensus_fvc_pre)
gg_fvc <- ggplot(gse47460_phenom_consensus_fvc_pre, aes(consensusclass, fvc_pre, fill = consensusclass)) +
  geom_boxplot(width=0.5, show.legend = FALSE) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() +
  scale_fill_manual(values=nicecolors) + theme(legend.position = "none")

# FEV1 (fev1_pre). The plot object is now named gg_fev; it was previously stored as gg_fvc.
gse47460_phenom_consensus_fev1_pre <- subset(gse47460_phenom_consensus, fev1_pre!='no_value')
gse47460_phenom_consensus_fev1_pre$fev1_pre <- as.numeric(gse47460_phenom_consensus_fev1_pre$fev1_pre)
dunnTest(fev1_pre~consensusclass, data = gse47460_phenom_consensus_fev1_pre)
gg_fev <- ggplot(gse47460_phenom_consensus_fev1_pre, aes(consensusclass, fev1_pre, fill = consensusclass)) +
  geom_boxplot(width=0.5, show.legend = FALSE) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() +
  scale_fill_manual(values=nicecolors) + theme(legend.position = "none")

# Panel order is unchanged: DLCO, FVC, FEV1.
plot_grid(gg_dlco, gg_fvc, gg_fev)

######################Figures 3, 4, 5., Supplementary Figures 3, 5, 6, 8.##############
#GSE132771 processing for cell signatures
#processed from 10X output provided by authors
#see https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132771&format=file
# Same library-path and string handling setup as at the top of the script, repeated so that
# this part can be run in a fresh session.
.libPaths(c(.libPaths(), "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.4", "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.5"))
options(stringsAsFactors = F)
library(sctransform)
# Seurat and httpuv are loaded from a specific user library rather than from the search path.
library(Seurat, lib.loc = "/sc/wo/home/karmajx/R/library")
library(httpuv, lib.loc = "/sc/wo/home/karmajx/R/library")
library(cowplot)
library(harmony)
library(magrittr)
library(foreach)
library(future)
library(dplyr)
# Trigger garbage collection before the single-cell objects are loaded.
gc()
# Read every immediate sub-directory of the current working directory as a 10X matrix and
# store it in a global object named '<directory>_10X'.
# NOTE(review): no setwd() precedes this loop in this section, so the directories are looked up
# in whatever working directory is active at this point - confirm it is the GSE132771 download folder.
for (mydir in list.dirs(recursive = FALSE)){
  assign(paste0(gsub('./', '', mydir, fixed = TRUE), '_10X'), Read10X(mydir, gene.column = 1))
}
# Collect the matrices (objects whose name contains 'GSM') into a named list, then drop the
# individual global copies.
gsmlist <- mget(ls(pattern='GSM'))
rm(list=ls(pattern = 'GSM[0-9]'))
# One Seurat object per sample, keeping features detected in at least 5 cells.
# seq_along() replaces 1:length() so an empty list is skipped instead of indexing element 0.
for (i in seq_along(gsmlist)){
  assign(paste0(names(gsmlist)[i], '_seuratobject'), CreateSeuratObject(gsmlist[[i]], project = names(gsmlist)[i], min.cells = 5))
}

# mget(ls()) returns the objects sorted by name; the positional sample selections made
# later in the script rely on that order.
seuratlist <- mget(ls(pattern = 'seuratobject'))
# Per-sample QC and standard preprocessing. The loop now covers however many samples were
# loaded instead of a hard-coded 1:24.
for (i in seq_along(seuratlist)){
  seuratlist[[i]] <- PercentageFeatureSet(seuratlist[[i]], pattern = "^MT-", col.name = "percent.mt")
  # Keep cells with 200 < detected features < 3000 and less than 20% mitochondrial counts.
  seuratlist[[i]] <- subset(seuratlist[[i]], subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 20)
  seuratlist[[i]] <- NormalizeData(seuratlist[[i]], verbose = TRUE)
  seuratlist[[i]] <- FindVariableFeatures(seuratlist[[i]], selection.method = "vst", nfeatures = 2000)
  seuratlist[[i]] <- ScaleData(seuratlist[[i]], verbose = TRUE)
  seuratlist[[i]] <- RunPCA(seuratlist[[i]], npcs = 30, verbose = TRUE)
  seuratlist[[i]] <- RunTSNE(seuratlist[[i]], reduction = "pca", dims = 1:20)
  seuratlist[[i]] <- FindNeighbors(seuratlist[[i]], reduction = "pca", dims = 1:20)
  seuratlist[[i]] <- FindClusters(seuratlist[[i]], resolution = 0.6)
  # Prefix cell names with the sample object name so they stay unique after integration.
  seuratlist[[i]] <- RenameCells(seuratlist[[i]], add.cell.id=names(seuratlist)[i])
}

#GSE132771 Total lung cell suspension data
# Integrate the samples at positions 10, 12, ..., 24 of seuratlist.
# NOTE(review): the selection is positional and depends on the name-sorted order of seuratlist -
# confirm these positions are the total lung suspension samples.
humanseurat_all <- seuratlist[c(seq(10, 24, by=2))]
ipf_all_anchors <- FindIntegrationAnchors(object.list = humanseurat_all, dims = 1:20)
ipf_all_combined <- IntegrateData(anchorset = ipf_all_anchors, dims = 1:20)
# Dimensionality reduction and clustering below run on the integrated assay.
DefaultAssay(ipf_all_combined) <- "integrated"
ipf_all_combined <- ScaleData(ipf_all_combined, verbose = TRUE)
ipf_all_combined <- RunPCA(ipf_all_combined, npcs = 30, verbose = TRUE)
# JackStraw is computed here; no scoring or plotting of its result appears in this section.
ipf_all_combined <- JackStraw(ipf_all_combined, dims = 30)
ipf_all_combined <- RunTSNE(ipf_all_combined, reduction = "pca", dims = 1:30)
ipf_all_combined <- FindNeighbors(ipf_all_combined, reduction = "pca", dims = 1:30)
ipf_all_combined <- FindClusters(ipf_all_combined, resolution = 0.8)
# Translate the cluster identities, taken in sorted order, into the 25 labels listed in `to`.
# NOTE(review): assumes the clustering above produced exactly 25 identities - confirm when re-running.
ipf_all_combined@meta.data$celltype <- plyr::mapvalues(ipf_all_combined@active.ident, from = sort(unique(ipf_all_combined@active.ident)), 
                                                       to = c('SPP1_monocytes_0', 'Infl_monocytes_1', 'ACKR1pos_endo_2', 'ACKR1neg_endo_3',
                                                              'Fibroblasts_4', 'AT2_5', 'Th_6', 'Pericytes_7', 'HLAhigh_mac_8',
                                                              'Sm_9', 'HLAhigh_mac_10', 'Bcells_11', 'Tc_12', 'AT1_13', 'PC_14', 'Endo_15',
                                                              'Ciliated_16', 'Monocytes_17', 'Monocytes_18', 'Cluster_19', 'Cluster_20',
                                                              'Bcells_21', 'Pericytes_22', 'AT2_23', 'Endo_24'))
#Supplementary Figure 3A:
# Use the new labels as the active identity so the plot is labelled by cell type.
Idents(ipf_all_combined) <- 'celltype'
DimPlot(ipf_all_combined, label = T, label.size = 6) + NoLegend()

#GSE132771 Lin- data (CD45-Epcam-CD235a-)
# Integrate the samples at the listed odd positions of seuratlist.
# NOTE(review): the selection is positional and depends on the name-sorted order of seuratlist -
# confirm these positions are the Lin- samples.
humanseurat_lin <- seuratlist[c(9, 11, 13, 15, 17, 19, 21, 23)]
ipf_lin_anchors <- FindIntegrationAnchors(object.list = humanseurat_lin, dims = 1:20)
ipf_lin_combined <- IntegrateData(anchorset = ipf_lin_anchors, dims = 1:20)
# Dimensionality reduction and clustering below run on the integrated assay.
DefaultAssay(ipf_lin_combined) <- "integrated"
ipf_lin_combined <- ScaleData(ipf_lin_combined, verbose = TRUE)
ipf_lin_combined <- RunPCA(ipf_lin_combined, npcs = 30, verbose = TRUE)
# JackStraw is computed here; no scoring or plotting of its result appears in this section.
ipf_lin_combined <- JackStraw(ipf_lin_combined, dims = 30)
ipf_lin_combined <- RunTSNE(ipf_lin_combined, reduction = "pca", dims = 1:30)
ipf_lin_combined <- FindNeighbors(ipf_lin_combined, reduction = "pca", dims = 1:30)
ipf_lin_combined <- FindClusters(ipf_lin_combined, resolution = 0.4)
# Translate the cluster identities, taken in sorted order, into the 15 labels listed in `to`.
# NOTE(review): assumes the clustering above produced exactly 15 identities - confirm when re-running.
ipf_lin_combined@meta.data$celltype <- plyr::mapvalues(ipf_lin_combined@active.ident, from = sort(unique(ipf_lin_combined@active.ident)), 
                                                       to = c('THY1high_alv_fib_0', 'THY1pos_sm_1', 'THY1neg_sm_2', 'CTHRC1pos_3', 'Adventitial_4',
                                                              'THY1neg_alv_fib_5', 'Pericytes_6', 'Peribronchial_7', 'Sm_8', 'Alveolar_9',
                                                              'Alveolar_10', 'Epi_11', 'Hematopoietic_12', 'Sm_13', 'Hematopoietic_14'))
# Use the new labels as the active identity so the plot is labelled by cell type.
Idents(ipf_lin_combined) <- 'celltype'
#Supplementary Figure 3B:
DimPlot(ipf_lin_combined, label = T, label.size = 6) + NoLegend()
FeaturePlot(ipf_lin_combined, 'THY1', pt.size = 1)
# Persist the annotated Lin- object.
save(ipf_lin_combined, file = "/sc/wo/tri_data/jk/2019_07_Hs_Sheppard_IPF_ILD_scRNAseq/ipf_control_lin_noig_combined_symbols_fromraw_062620.RData")

#GSE135893 processing
setwd("/sc/wo/tri_data/jk/2020_05_Hs_IPF_GSE135893")
gse135893 <- readRDS('GSE135893_ILD_annotated_fullsize.rds') # RDS file deposited by authors of GSE135893
#see https://ftp.ncbi.nlm.nih.gov/geo/series/GSE135nnn/GSE135893/suppl/GSE135893_ILD_annotated_fullsize.rds.gz
#removing IG genes so they are not used as plasma cell signature
# Successively drop feature names starting with IGH, IGL and IGK.
gse135893_features <- row.names(gse135893@assays$RNA@counts)
notigh <- grep('^IGH', gse135893_features, value = TRUE, invert = TRUE)
notigl <- grep('^IGL', notigh, value = TRUE, invert = TRUE)
notigk <- grep('^IGK', notigl, value = TRUE, invert = TRUE)
# Print any remaining feature names that still start with 'IG', for inspection.
notigk[grep('^IG', notigk)]

# Keep the filtered features and only the cells whose Diagnosis is IPF or Control.
gse135893_meta <- gse135893@meta.data
ipf_control_cells <- row.names(gse135893_meta)[gse135893_meta$Diagnosis %in% c('IPF', 'Control')]
gse135893_ipf_control <- subset(gse135893, features = notigk, cells = ipf_control_cells)
# Re-normalise and cluster the IPF/control subset on the RNA assay.
DefaultAssay(gse135893_ipf_control) <- 'RNA'
gse135893_ipf_control <- NormalizeData(gse135893_ipf_control, verbose = TRUE)
gse135893_ipf_control <- FindVariableFeatures(gse135893_ipf_control, selection.method = "vst", nfeatures = 2000)
gse135893_ipf_control <- ScaleData(gse135893_ipf_control, verbose = TRUE)
gse135893_ipf_control <- RunPCA(gse135893_ipf_control, npcs = 30, verbose = TRUE)
#gse135893_ipf_control <- JackStraw(gse135893_ipf_control, num.replicate = 100)
#gse135893_ipf_control <- ScoreJackStraw(gse135893_ipf_control, dims = 1:20)
gse135893_ipf_control <- RunUMAP(gse135893_ipf_control, reduction = "pca", dims = 1:20)
gse135893_ipf_control <- FindNeighbors(gse135893_ipf_control, reduction = "pca", dims = 1:20)
gse135893_ipf_control <- FindClusters(gse135893_ipf_control, resolution = 1)
# Cluster overview, split by diagnosis and combined.
DimPlot(gse135893_ipf_control, label = TRUE, label.size = 6, split.by = 'Diagnosis') + NoLegend()
DimPlot(gse135893_ipf_control, label = TRUE, label.size = 4) + NoLegend()
# Marker expression used to annotate the clusters.
# 'HLA-DBP1' was a misspelling of the gene symbol HLA-DPB1, so that feature could not be found
# among the features and its panel was missing.
FeaturePlot(gse135893_ipf_control, features = c('C1QA', 'SPP1', 'HLA-DPB1', 'ACKR1', 'GZMB'), pt.size = 1)
FeaturePlot(gse135893_ipf_control, features = c('COL1A1', 'CTHRC1', 'HAS1', 'HAS2'), pt.size = 1)

# Persist the clustered object before annotation.
save(gse135893_ipf_control, file = 'gse135893_ipf_control_seuratnorm_062820.RData')
FeaturePlot(gse135893_ipf_control, features = c('SFTPC', 'SFTPA1', 'SFTPA2', 'MUC5B'), pt.size = 1)
FeaturePlot(gse135893_ipf_control, features = c('SOX4', 'ELF3', 'TSC22D1', 'NUPR1', 'JUND'), pt.size = 1, split.by = 'Diagnosis')
FeaturePlot(gse135893_ipf_control, features = c('GSN'), pt.size = 1, split.by = 'Diagnosis')
# Labels for the cluster identities, listed in the sorted order of the identities (33 labels).
# NOTE(review): assumes the clustering above produced exactly 33 identities - confirm when re-running.
gse135893_cluster_labels <- c('Ciliated_0', 'Ciliated_1', 'AT2_2',
                              'SPP1_mac_3', 'C1QA_mac_4', 'C1QA_mac_5',
                              'cDC_6', 'Mono_7', 'Tc_8',
                              'C1QA_mac_9', 'Th_10', 'AT1_11', 'C1QA_mac_12',
                              'AT2_13', 'ACKR1_pos_endo_14',
                              'MUC5Bpos_AT1_15', 'ACKR1_neg_endo_16',
                              'Basal_AT1_17', 'Diff_cil_18',
                              'Fibroblasts_19', 'ACKR1_neg_endo_20',
                              'Monocytes_21', 'Prolif_mac_22', 'Fibroblasts_23',
                              'Ly_endo_24', 'Bcells_25', 'Sm_26', 'MC_27', 'PC_28',
                              'AT2_29', 'AT2_30', 'Mesothelial_31', 'Mac_32')
gse135893_cluster_ids <- sort(unique(gse135893_ipf_control@active.ident))
gse135893_ipf_control@meta.data$celltypenew <- plyr::mapvalues(gse135893_ipf_control@active.ident,
                                                               from = gse135893_cluster_ids,
                                                               to = gse135893_cluster_labels)
# Make the new labels the active identity.
Idents(gse135893_ipf_control) <- 'celltypenew'

#Supplementary Figure 5A:
DimPlot(gse135893_ipf_control, label = T, label.size = 4) + NoLegend()

#calculate cell type signatures. Repeated for each dataset (GSE132771 total lung suspension, GSE132771 Lin- data, GSE135893).
#i.e. total 3 gene signature sets are developed.
#example of GSE132771 total lung shown.

# Dense copy of the RNA assay `data` slot, the active identity of every cell as a character
# vector, and the cell metadata table.
data <- as.matrix(slot(ipf_all_combined@assays$RNA, "data"))
celltypes <- as.character(slot(ipf_all_combined, "active.ident"))
metadata <- slot(ipf_all_combined, "meta.data")
# Distinct cell type labels; `cells` and `group` hold the same values.
cells <- unique(metadata[["celltype"]])
group <- unique(metadata[["celltype"]])
#' Choose the size of a marker-gene signature by AUC.
#'
#' Candidate genes are ranked by decreasing value of the second column of `geneList`. For
#' j = 1..min(100, number of candidates) every column of `data` is scored by the mean of the
#' top-j genes, and the AUC of that score against `label` is computed with roc()/auc()
#' (pROC must be attached by the caller). The signature is the shortest top-j list whose AUC
#' is within 0.005 of the best AUC, extended to the top 10 genes when shorter.
#'
#' The signature is never longer than the number of candidates, so it contains no NA entries
#' when fewer than 10 candidates are supplied, and an empty candidate table gives an empty signature.
#'
#' @param geneList data.frame (or matrix) with gene names as row names; column 2 is the ranking score.
#' @param data numeric matrix with gene names as row names and one column per observation.
#' @param label class labels, one per column of `data`, passed to roc() as the response.
#' @return character vector of gene names, best-ranked first; character(0) if `geneList` has no rows.
optimizeGeneList <- function(geneList,data,label){
  if (nrow(geneList) == 0) {
    return(character(0))
  }
  geneList <- geneList[order(-geneList[,2]), , drop = FALSE]
  ranked_genes <- rownames(geneList)
  # At most the 100 best-ranked genes are evaluated.
  n <- min(nrow(geneList), 100)
  glist <- data.frame(geneIn = seq_len(n), AUC = 0, stringsAsFactors = FALSE)
  for (j in seq_len(n)) {
    ge <- ranked_genes[seq_len(j)]
    # Score of every observation: mean over the top-j genes.
    geD2 <- apply(data[ge, , drop = FALSE], 2, mean)
    glist[j, 2] <- as.numeric(auc(roc(label, geD2)))
  }
  increaseThr <- 0.005
  # Smallest list size whose AUC is within increaseThr of the maximum.
  mu <- min(which(max(glist[,2]) - glist[,2] < increaseThr))
  # At least 10 genes, but never more than there are candidates.
  mu <- min(max(mu, 10), nrow(geneList))
  ranked_genes[seq_len(mu)]
}
library(foreach)
library(doParallel)
library(pROC)
# Eight local workers for the per-identity marker search.
cl <- makeCluster(8)
registerDoParallel(cl)
# For every distinct identity: MAST-based positive markers of that identity against all other
# cells, then the AUC-optimised signature. Each list element holds the identity (cellgroup),
# the full marker table (result) and the signature genes (signature).
# tryCatch(..., finally = ) shuts the workers down even when one of the tasks fails.
betweenGroupCompare <- tryCatch(
  foreach(i = seq_along(unique(celltypes)), .packages=c("Seurat","pROC")) %dopar% {
    # Workers start with default library paths, so the user libraries are added again here.
    .libPaths(c(.libPaths(), "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.4", "/sc/wo/home/karmajx/R/x86_64-pc-linux-gnu-library/3.5"))
    library(MAST)
    cell_group <- unique(celltypes)[i]
    id <- Idents(ipf_all_combined)
    # 1 for cells of this identity, 0 for all other cells.
    label <- ifelse(id==cell_group,1,0)

    geneList <- FindMarkers(object=ipf_all_combined,ident.1=cell_group,test.use="MAST",min.pct=0.05,only.pos = TRUE)
    g <- optimizeGeneList(geneList,data,label)

    list(cellgroup=cell_group,result=geneList,signature=g)
  },
  finally = stopCluster(cl)
)

# Thresholds applied to columns 2 and 5 of each marker table below.
# NOTE(review): presumably column 2 is the average log fold change and column 5 the adjusted
# p-value of the FindMarkers() output - confirm against the Seurat version in use.
fcthr <- 0.38
fdrthr <- 0.05
# One worker per entry of betweenGroupCompare.
cl <- makeCluster(length(betweenGroupCompare))
registerDoParallel(cl)
# Second pass over the per-group results. Each task returns a list (`cellSig`) of
# list(celltype, genelist, signature) entries.
x <- foreach(i = 1:length(betweenGroupCompare), .packages=c("Seurat","pROC")) %dopar% {
  
  g <- betweenGroupCompare[[i]]$cellgroup
  sigR <- betweenGroupCompare[[i]]$result
  # Marker genes of this group that pass both thresholds.
  sig <- rownames(sigR)[sigR[,2]>fcthr & sigR[,5]<fdrthr]
  
  # Distinct values of the "celltype" column among the rows whose "celltype" equals g.
  # Because the filter and the extracted column are the same, this holds at most one value,
  # so the length(cellT)>1 branch below is not reached with this metadata column.
  cellT <- as.character(unique(metadata[metadata[,"celltype"]==g,"celltype"]))
  
  cellSig <- list()
  ci <- 1
  if(length(cellT)>1){
    # Several cell types within one group: rebuild a Seurat object from the group's cells and
    # the thresholded marker genes, then derive one signature per cell type inside the group.
    subm <- metadata[metadata[,"celltype"]==g,]
    subd <- data[sig,rownames(subm)]
    subo <- CreateSeuratObject(subd)
    subo <- AddMetaData(object=subo,metadata=subm)
    subo <- SetIdent(object=subo,value=subm[,"celltype"])
    for(j in c(1:length(cellT))){
      id <- Idents(subo)
      label <- ifelse(id==cellT[j],1,0)
      genelist <- FindMarkers(object=subo,ident.1=cellT[j],test.use="MAST",min.pct=0.05,only.pos = TRUE)
      # Note: from here on `g` holds the signature genes, no longer the group name.
      g <- optimizeGeneList(genelist,subd,label)
      
      r <- list(celltype=cellT[j],genelist=genelist,signature=g)
      cellSig[[ci]] <- r
      ci <- ci+1
    }
  }else{
    # Single cell type in the group: reuse the marker table and signature from the first pass.
    r <- list(celltype=g,genelist=sigR,signature=betweenGroupCompare[[i]]$signature)
    cellSig[[ci]] <- r
    ci <- ci+1
  }
  return(cellSig)
}
stopCluster(cl)

# Flatten the per-group results in `x` into a long table with one row per
# (cell type, signature gene) pair and write it out as a tab-separated file.
# Every element of `x` is a list of entries with fields `celltype` and `signature`.
# The table is built in one step instead of being grown row by row, and an entry with an
# empty signature simply contributes no rows.
signature_entries <- unlist(x, recursive = FALSE)
entry_genes <- lapply(signature_entries, function(entry) as.character(entry$signature))
entry_labels <- vapply(signature_entries, function(entry) as.character(entry$celltype), character(1))
gene_sig <- data.frame(celltype = rep(unname(entry_labels), lengths(entry_genes)),
                       signature = as.character(unlist(entry_genes)),
                       stringsAsFactors = FALSE)
write.table(gene_sig,file="ipf_all_combined_signature.txt",row.names=F,col.names = T,sep="\t",quote=F)

#########plot heatmap######
#gene_sig <- gene_sig[!is.na(gene_sig[,2]),]
# NOTE(review): `cellsignatures` is first assigned further down in this file (read.table of a
# signature file), so it must already exist in the session when this line runs - confirm the
# intended order of execution.
gene_sig <- cellsignatures
# Distinct cell types (column 1) and distinct signature genes (column 2).
c <- unique(gene_sig[,1])
g <- unique(gene_sig[,2])
# Cell type x gene indicator matrix: 1 where the gene belongs to that cell type's signature.
cgMatrix <- matrix(0, nrow = length(c), ncol = length(g), dimnames = list(c, g))
cgMatrix[cbind(gene_sig[,1], gene_sig[,2])] <- 1

library(pheatmap)

# Expression values and metadata of the GSE135893 IPF/control object.
data <- gse135893_ipf_control@assays$RNA@data
metadata <- gse135893_ipf_control@meta.data
data <- as.matrix(data)

# Restrict to the signature genes in `g`; every gene in `g` must be a row name of `data`.
filter_d <- data[g,]
# Back-transform the values with base 2.
# NOTE(review): confirm that 2^x - 1 is the inverse of the transform NormalizeData() applied to this slot.
filter_d <- 2^filter_d-1
c <- unique(gene_sig[,1])
# Gene x cell type matrix of mean back-transformed expression.
finalSig <- array(0,dim=c(length(g),length(c)))
rownames(finalSig) <- g
colnames(finalSig) <- c
for(i in c(1:length(c))){
  k <- c[i]
  # Cells whose "celltypenew" label equals this cell type name.
  # NOTE(review): the labels in gene_sig column 1 must be celltypenew values of this object,
  # otherwise no cells are selected - confirm that gene_sig holds the matching signature table.
  l <- rownames(metadata)[metadata[,"celltypenew"]==k]
  b <- filter_d[,l]
  b <- apply(b,1,mean)
  # Report the index of any cell type for which a mean could not be computed.
  if(length(which(is.na(b)))>0){
    cat("I:",i,"\n")
  }
  finalSig[,i] <- b
}

# Row-wise z-score (the per-gene mean and sd vectors recycle down the rows), clipped to [-2, 2].
a <- (finalSig-apply(finalSig,1,mean))/apply(finalSig,1,sd)
a[a<(-2)] <- (-2)
a[a>2] <- 2

library(pheatmap)
# Heatmap is written to a PNG file; the device is closed by dev.off() below.
png("IPF_gse135893_lin_manuscript_heatmap.png",width=1000,height=1000,res=100)

# 255 colours from blue through white to red, with 256 evenly spaced breaks over [-2, 2].
red <- colorRampPalette(c("blue","white","red"))(255)
mybreak <- seq(-2,2,length.out=256)

# Rows and columns are kept in their given order (no clustering, no additional scaling).
pheatmap(a,color=red,breaks=mybreak,scale="none",cluster_rows=F,cluster_cols=F,show_rownames=FALSE,show_colnames=T)

dev.off()

# Unscaled mean-expression matrix written as a tab-separated reference table.
write.table(finalSig,file="IPF_scRNAseq_17cells_signature_MAST_referenceMatrix.txt",row.names=T,col.names=T,sep="\t",quote=F)


#calculate signature scores in GSE47460. Repeated for each signature set (GSE132771 total lung suspension, GSE132771 Lin- data, GSE135893).
#example of GSE132771 total lung shown.

# Each file is a two-column table (celltype, signature) with one row per signature gene.
#correct Sheppard lin
cellsignatures_lin <- read.table('/sc/wo/tri_data/jk/2019_07_Hs_Sheppard_IPF_ILD_scRNAseq/ipf_control_lin_scRNAseq_cells_signature_MAST_062620.txt', header = TRUE)
#correct Sheppard all
cellsignatures <- read.table('/sc/wo/tri_data/jk/2019_07_Hs_Sheppard_IPF_ILD_scRNAseq/ipf_control_all_res08_scRNAseq_cells_signature_MAST_062620.txt', header = TRUE)
#correct GSE135893
cellsignatures_gse135893 <- read.table('/sc/wo/tri_data/jk/2020_05_Hs_IPF_GSE135893/gse135893_ipf_control_res1_noig_seuratnorm_signature_062720.txt', header = TRUE)

# Gene names available in the bulk expression matrix, used for the overlap counts below.
gse47460_genes <- row.names(gse47460_matrix)

# Total lung signatures: replace '-' by '_' in the labels, split the genes into a named list
# per cell type, and print the number of signature genes present in GSE47460 (ascending).
cellsignatures$celltype <- gsub('-', '_', cellsignatures$celltype, fixed = TRUE)
celltypes <- unique(cellsignatures$celltype)
cellsignatures_genes <- setNames(
  lapply(celltypes, function(k) cellsignatures$signature[which(cellsignatures$celltype == k)]),
  celltypes
)
sort(vapply(cellsignatures_genes, function(k) length(intersect(gse47460_genes, k)), integer(1)))

# Lin- signatures: same steps.
cellsignatures_lin$celltype <- gsub('-', '_', cellsignatures_lin$celltype, fixed = TRUE)
celltypes_lin <- unique(cellsignatures_lin$celltype)
cellsignatures_lin_genes <- setNames(
  lapply(celltypes_lin, function(k) cellsignatures_lin$signature[which(cellsignatures_lin$celltype == k)]),
  celltypes_lin
)
sort(vapply(cellsignatures_lin_genes, function(k) length(intersect(gse47460_genes, k)), integer(1)))

# GSE135893 signatures: same steps.
cellsignatures_gse135893$celltype <- gsub('-', '_', cellsignatures_gse135893$celltype, fixed = TRUE)
celltypes_gse135893 <- unique(cellsignatures_gse135893$celltype)
cellsignatures_gse135893_genes <- setNames(
  lapply(celltypes_gse135893, function(k) cellsignatures_gse135893$signature[which(cellsignatures_gse135893$celltype == k)]),
  celltypes_gse135893
)
sort(vapply(cellsignatures_gse135893_genes, function(k) length(intersect(gse47460_genes, k)), integer(1)))

#removed 'Cluster_19 in GSE135893' (mitochondrial genes high) as it has no overlap with genes in GSE47460
# Signatures sharing no gene with GSE47460 are dropped by testing the overlap
# itself. The previous code dropped position 16 of `cellsignatures_genes` by a
# hard-coded index (c(1:15, 17:25)), which (a) targeted the total-lung list
# although the comment above names a GSE135893 cluster, and (b) silently breaks
# whenever the signature file changes length or order.
# NOTE(review): if position 16 of the total-lung list was dropped for another
# reason than zero overlap, restore that exclusion explicitly by name.
gse47460_genes <- row.names(gse47460_matrix)
keep_overlapping_sets <- function(gene_sets) {
  has_overlap <- vapply(gene_sets,
                        function(k) length(intersect(gse47460_genes, k)) > 0,
                        logical(1))
  gene_sets[has_overlap]
}
cellsignatures_genes <- keep_overlapping_sets(cellsignatures_genes)
cellsignatures_lin_genes <- keep_overlapping_sets(cellsignatures_lin_genes)
cellsignatures_gse135893_genes <- keep_overlapping_sets(cellsignatures_gse135893_genes)
names(cellsignatures_genes)

# Score each signature separately with gsva() (one single-set call per cell
# type), then stack the one-row results into a signature x sample table whose
# row names are the cell-type labels.
gse47460_celltypegsva <- lapply(cellsignatures_genes, function(k) gsva(as.matrix(gse47460_matrix), gset.idx.list = list(k)))
names(gse47460_celltypegsva) <- names(cellsignatures_genes)
gse47460_celltypegsva_table <- as.data.frame(do.call(rbind, gse47460_celltypegsva))
rownames(gse47460_celltypegsva_table) <- names(gse47460_celltypegsva)

# Same scoring for the cellsignatures_lin gene lists.
gse47460_celltypegsva_lin <- lapply(cellsignatures_lin_genes, function(k) gsva(as.matrix(gse47460_matrix), gset.idx.list = list(k)))
names(gse47460_celltypegsva_lin) <- names(cellsignatures_lin_genes)
gse47460_celltypegsva_lin_table <- as.data.frame(do.call(rbind, gse47460_celltypegsva_lin))
rownames(gse47460_celltypegsva_lin_table) <- names(gse47460_celltypegsva_lin)

# Same scoring for the cellsignatures_gse135893 gene lists.
gse47460_celltypegsva_gse135893 <- lapply(cellsignatures_gse135893_genes, function(k) gsva(as.matrix(gse47460_matrix), gset.idx.list = list(k)))
names(gse47460_celltypegsva_gse135893) <- names(cellsignatures_gse135893_genes)
gse47460_celltypegsva_gse135893_table <- as.data.frame(do.call(rbind, gse47460_celltypegsva_gse135893))
rownames(gse47460_celltypegsva_gse135893_table) <- names(gse47460_celltypegsva_gse135893)

#Supplementary Figures 3C, 3D, 5B.
#correlation matrix from GSVA scores
# Each stanza transposes a score table (rows named by signature) so that cor()
# correlates the signatures with each other, rounds to 3 decimals, and draws
# the matrix twice: once in full and once as a labelled lower triangle with a
# custom three-colour scale. hc.order = TRUE asks ggcorrplot to reorder the
# variables by hierarchical clustering.
# The p-value matrices from cor_pmat() are stored but not passed to the plots
# in this section.
#Supplementary Figure 3C.
corr <- round(cor(t(gse47460_celltypegsva_table)), 3)
p.mat <- cor_pmat(t(gse47460_celltypegsva_table))
ggcorrplot(corr, hc.order = TRUE, outline.col = "white")
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           outline.col = "white",
           #ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"), lab = T)

#Supplementary Figure 3D.
corr_lin <- round(cor(t(gse47460_celltypegsva_lin_table)), 3)
p.mat_lin <- cor_pmat(t(gse47460_celltypegsva_lin_table))
ggcorrplot(corr_lin, hc.order = TRUE, outline.col = "white")
ggcorrplot(corr_lin, hc.order = TRUE, type = "lower",
           outline.col = "white",
           #ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"), lab = T)

#Supplementary Figure 5B.
corr_gse135893 <- round(cor(t(gse47460_celltypegsva_gse135893_table)), 3)
p.mat_gse135893 <- cor_pmat(t(gse47460_celltypegsva_gse135893_table))
ggcorrplot(corr_gse135893, hc.order = TRUE, outline.col = "white")
ggcorrplot(corr_gse135893, hc.order = TRUE, type = "lower",
           outline.col = "white",
           #ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"), lab = T)

# Checkpoint the whole workspace, GSVA score matrices included.
save.image('/sc/wo/tri_data/jk/2018_IPF_publicdata_JK/ipf_consensusanalysis_manuscriptclean_with_gsvamatrices_111020.RData')

# Attach the class table (consensusdf2_gse47460) to each score table by joining
# on row names (samples). merge() returns the join key as a "Row.names" column,
# which is moved back into the row names and then removed.
gse47460_celltypegsva_consensus <- merge(
  consensusdf2_gse47460,
  t(gse47460_celltypegsva_table),
  by = "row.names"
)
rownames(gse47460_celltypegsva_consensus) <- gse47460_celltypegsva_consensus[["Row.names"]]
gse47460_celltypegsva_consensus[["Row.names"]] <- NULL

gse47460_celltypegsva_lin_consensus <- merge(
  consensusdf2_gse47460,
  t(gse47460_celltypegsva_lin_table),
  by = "row.names"
)
rownames(gse47460_celltypegsva_lin_consensus) <- gse47460_celltypegsva_lin_consensus[["Row.names"]]
gse47460_celltypegsva_lin_consensus[["Row.names"]] <- NULL

gse47460_celltypegsva_gse135893_consensus <- merge(
  consensusdf2_gse47460,
  t(gse47460_celltypegsva_gse135893_table),
  by = "row.names"
)
rownames(gse47460_celltypegsva_gse135893_consensus) <- gse47460_celltypegsva_gse135893_consensus[["Row.names"]]
gse47460_celltypegsva_gse135893_consensus[["Row.names"]] <- NULL

#Figure 3 and 4.
# One box + jitter plot of signature score by consensus class for every score
# column of gse47460_celltypegsva_consensus, arranged in one grid.
# Clear stale per-plot objects left behind by earlier runs of this section.
rm(list=ls(pattern = 'gg_celltypes2_gse47460_'))
# Guarded: a bare rm() warns when the object does not exist yet.
if (exists('target_gg_gse47460')) rm(target_gg_gse47460)
# Every column after the first (the class column) is plotted.
score_cols <- colnames(gse47460_celltypegsva_consensus)[-1]
# The plots are built directly into a list. In the previous assign()-based
# loop, `+ xlab(...)` sat outside the closing parenthesis of assign(), so the
# axis label was added to a discarded copy and never reached the stored plot.
target_gg_gse47460 <- lapply(score_cols, function(i) {
  ggplot(gse47460_celltypegsva_consensus, aes_string('consensusclass', i, fill = 'consensusclass')) +
    geom_boxplot(width = 0.5, show.legend = FALSE) +
    geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
    theme_bw() +
    scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
    ylab('Signature score') + xlab('consensusclass') + ggtitle(i) +
    theme(plot.title = element_text(size = 12), legend.position = "none")
})
# Same names and alphabetical panel order that mget(ls(pattern = ...)) produced.
names(target_gg_gse47460) <- paste0('gg_celltypes2_gse47460_', score_cols)
target_gg_gse47460 <- target_gg_gse47460[sort(names(target_gg_gse47460))]
do.call(plot_grid, target_gg_gse47460)

#Figure 5.
# One box + jitter plot of signature score by consensus class for every score
# column of gse47460_celltypegsva_lin_consensus, arranged in one grid.
# Clear stale per-plot objects left behind by earlier runs of this section.
rm(list=ls(pattern = 'gg_celltypes2_gse47460_'))
# Guarded: a bare rm() warns when the object does not exist yet.
if (exists('target_gg_gse47460')) rm(target_gg_gse47460)
# Every column after the first (the class column) is plotted.
score_cols <- colnames(gse47460_celltypegsva_lin_consensus)[-1]
# The plots are built directly into a list. In the previous assign()-based
# loop, `+ xlab(...)` sat outside the closing parenthesis of assign(), so the
# axis label was added to a discarded copy and never reached the stored plot.
target_gg_gse47460 <- lapply(score_cols, function(i) {
  ggplot(gse47460_celltypegsva_lin_consensus, aes_string('consensusclass', i, fill = 'consensusclass')) +
    geom_boxplot(width = 0.5, show.legend = FALSE) +
    geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
    theme_bw() +
    scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
    ylab('Signature score') + xlab('consensusclass') + ggtitle(i) +
    theme(plot.title = element_text(size = 12), legend.position = "none")
})
# Same names and alphabetical panel order that mget(ls(pattern = ...)) produced.
names(target_gg_gse47460) <- paste0('gg_celltypes2_gse47460_', score_cols)
target_gg_gse47460 <- target_gg_gse47460[sort(names(target_gg_gse47460))]
do.call(plot_grid, target_gg_gse47460)

#Supplementary Figure 6.
# One box + jitter plot of signature score by consensus class for every score
# column of gse47460_celltypegsva_gse135893_consensus, arranged in one grid.
# Clear stale per-plot objects left behind by earlier runs of this section.
rm(list=ls(pattern = 'gg_celltypes2_gse47460_'))
# Guarded: a bare rm() warns when the object does not exist yet.
if (exists('target_gg_gse47460')) rm(target_gg_gse47460)
# Every column after the first (the class column) is plotted.
score_cols <- colnames(gse47460_celltypegsva_gse135893_consensus)[-1]
# The plots are built directly into a list. In the previous assign()-based
# loop, `+ xlab(...)` sat outside the closing parenthesis of assign(), so the
# axis label was added to a discarded copy and never reached the stored plot.
target_gg_gse47460 <- lapply(score_cols, function(i) {
  ggplot(gse47460_celltypegsva_gse135893_consensus, aes_string('consensusclass', i, fill = 'consensusclass')) +
    geom_boxplot(width = 0.5, show.legend = FALSE) +
    geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
    theme_bw() +
    scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
    ylab('Signature score') + xlab('consensusclass') + ggtitle(i) +
    theme(plot.title = element_text(size = 12), legend.position = "none")
})
# Same names and alphabetical panel order that mget(ls(pattern = ...)) produced.
names(target_gg_gse47460) <- paste0('gg_celltypes2_gse47460_', score_cols)
target_gg_gse47460 <- target_gg_gse47460[sort(names(target_gg_gse47460))]
do.call(plot_grid, target_gg_gse47460)

#####################GSE134692#######################
# Consensus clustering of the IPF samples of GSE134692 on the most variable
# genes, followed by a PAC (proportion of ambiguous clustering) curve for
# k = 2..6.
gse134692 <- read.table('GSE134692_tmm_normalized_and_filtered_log2CPM.txt', header = TRUE, stringsAsFactors = FALSE, row.names = 1)
gse134692_annotation <- read.delim('GSE134692_gene_annotation.txt', header = TRUE, sep = '\t', row.names = 1)
gse134692_pheno <- read.delim('GSE134692_design.txt', header = TRUE, row.names = 1, sep = '\t')
table(gse134692_pheno$DiseaseStatus)

set.seed(1)
# Keep only the samples annotated as IPF.
d_gse134692 <- gse134692[,row.names(subset(gse134692_pheno, DiseaseStatus=='IPF'))]
# Rank genes by median absolute deviation and keep the top 5000. head() caps
# the selection at the number of available genes; indexing with [1:5000]
# produced all-NA rows whenever fewer than 5000 genes were present.
mads_gse134692 <- apply(d_gse134692,1,mad)
d_gse134692 <- d_gse134692[head(rev(order(mads_gse134692)), 5000),]
# Centre every gene on its median (sweep() subtracts by default).
d_gse134692 <- as.matrix(sweep(d_gse134692,1, apply(d_gse134692,1,median,na.rm=TRUE)))
# showWarnings = FALSE: re-running the script must not warn when the output
# directory already exists.
dir.create('gse134692_consensus_081820a_5000', showWarnings = FALSE)
title <- 'gse134692_consensus_081820a_5000'
results_gse134692 <- ConsensusClusterPlus(d_gse134692,maxK=6,reps=1000,pItem=0.8,pFeature=1,clusterAlg="hc", title = title,
                                          distance="pearson",seed=1234, plot = 'png')
resultsicl <- calcICL(results_gse134692)
# Consensus matrices for k = 2..6 and their PAC over the (0.1, 0.9) interval.
consensusmatrices <- lapply(2:6, function(k) results_gse134692[[k]]$consensusMatrix)
names(consensusmatrices) <- paste0('k', seq(2, 6))
pac <- lapply(consensusmatrices, function(k) diceR::PAC(k, lower = 0.1, upper = 0.9))
pacframe <- as.data.frame(unlist(pac))
colnames(pacframe) <- 'PAC'
pacframe$number_of_clusters <- seq(2, 6)
ggplot(pacframe, aes(x=number_of_clusters, y=PAC)) + 
  geom_point(shape=18, color="blue", size = 5)+
  geom_line()# + scale_y_continuous(limits = c(0.25, 0.35))

# Class table for GSE134692: the k = 2 consensus class of every clustered
# sample, labelled 'Cluster_1' / 'Cluster_2'.
consensusdf_gse134692 <- as.data.frame(results_gse134692[[2]]$consensusClass)
colnames(consensusdf_gse134692) <- 'consensusclass'
consensusdf_gse134692$consensusclass <- paste0('Cluster_', consensusdf_gse134692$consensusclass)
# `reverseclass` swaps the two cluster labels.
consensusdf_gse134692$reverseclass <- plyr::mapvalues(consensusdf_gse134692$consensusclass, from = c('Cluster_1', 'Cluster_2'), to = c('Cluster_2', 'Cluster_1')) # so colors match with other figures
# Samples annotated as Normal are appended to the class table.
healthydf_gse134692 <- subset(gse134692_pheno, DiseaseStatus=='Normal')
#rownames(healthydf_gse134692) <- healthydf_gse134692$geo
# NOTE(review): the second phenotype column is used as the `consensusclass`
# value of the normal samples; which column that is cannot be seen here --
# confirm it holds the intended label.
healthydf2_gse134692 <- healthydf_gse134692[,2]
names(healthydf2_gse134692) <- row.names(healthydf_gse134692)
healthydf2_gse134692 <- as.data.frame(healthydf2_gse134692)
colnames(healthydf2_gse134692) <- 'consensusclass'
# In `reverseclass` every normal sample is labelled 'Normal'.
healthydf2_gse134692$reverseclass <- rep('Normal', nrow(healthydf2_gse134692))
consensusdf2_gse134692 <- rbind(consensusdf_gse134692, healthydf2_gse134692)

# Rebuild the per-cell-type gene lists from `cellsignatures` (all cell types,
# in order of first appearance).
cellsignatures$celltype <- gsub('-', '_', cellsignatures$celltype, fixed = T)
celltypes <- unique(cellsignatures$celltype)
cellsignatures_genes <- lapply(celltypes, function(k) subset(cellsignatures, celltype==k)$signature)
names(cellsignatures_genes) <- celltypes
# Translate each signature into the row names of the annotation table whose
# GeneName is in the signature (presumably Ensembl IDs, going by the variable
# name), then print the per-signature overlap with the expression matrix rows.
cellsignatures_genes_ensembl <- lapply(cellsignatures_genes, function(k) row.names(subset(gse134692_annotation, GeneName %in% k)))
sort(unlist(lapply(cellsignatures_genes_ensembl, function(k) length(intersect(k, row.names(gse134692))))))

# One single-set gsva() call per cell type, stacked into a signature x sample
# matrix whose row names are the cell-type labels.
gse134692_celltypegsva <- lapply(cellsignatures_genes_ensembl, function(k) gsva(as.matrix(gse134692), gset.idx.list = list(k)))
names(gse134692_celltypegsva) <- names(cellsignatures_genes_ensembl)
gse134692_celltypegsva_table <- do.call(rbind, gse134692_celltypegsva)
rownames(gse134692_celltypegsva_table) <- names(gse134692_celltypegsva)
# Join the scores to the class table on row names; merge() returns the key as
# a 'Row.names' column, which is restored as row names and then dropped.
gse134692_celltypegsva_consensus <- merge(consensusdf2_gse134692, t(gse134692_celltypegsva_table), by.x = 0, by.y = 0)
rownames(gse134692_celltypegsva_consensus) <- gse134692_celltypegsva_consensus$Row.names
gse134692_celltypegsva_consensus$Row.names <- NULL

#Supplementary Figure 7.
# One box + jitter plot of signature score by (relabelled) class for every
# score column of gse134692_celltypegsva_consensus, arranged in one grid.
# Clear stale per-plot objects left behind by earlier runs of this section.
rm(list=ls(pattern = 'gg_celltypes2_gse134692_'))
# Guarded: a bare rm() warns when the object does not exist yet.
if (exists('target_gg_gse134692')) rm(target_gg_gse134692)
# The first two columns are the class columns (consensusclass, reverseclass).
score_cols <- colnames(gse134692_celltypegsva_consensus)[-(1:2)]
# The plots are built directly into a list. In the previous assign()-based
# loop, `+ xlab('consensusclass')` sat outside the closing parenthesis of
# assign(), so the stored plots kept the default axis title 'reverseclass'.
target_gg_gse134692 <- lapply(score_cols, function(i) {
  ggplot(gse134692_celltypegsva_consensus, aes_string('reverseclass', i, fill = 'reverseclass')) +
    geom_boxplot(width = 0.5, show.legend = FALSE) +
    geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
    theme_bw() +
    scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
    ylab('Signature score') + xlab('consensusclass') + ggtitle(i) +
    theme(plot.title = element_text(size = 12), legend.position = "none")
})
# Same names and alphabetical panel order that mget(ls(pattern = ...)) produced.
names(target_gg_gse134692) <- paste0('gg_celltypes2_gse134692_', score_cols)
target_gg_gse134692 <- target_gg_gse134692[sort(names(target_gg_gse134692))]
do.call(plot_grid, target_gg_gse134692)
# Checkpoint the workspace, then join the phenotype table to the transposed
# expression matrix on row names (one row per sample, one column per gene).
save.image('gse134692_consensusanalysis_081820.RData')
gse134692_phenom_consensus <- merge(gse134692_pheno, t(gse134692), by.x = 0, by.y = 0)
# merge() returns the join key as 'Row.names'; restore it as row names.
rownames(gse134692_phenom_consensus) <- gse134692_phenom_consensus$Row.names
gse134692_phenom_consensus$Row.names <- NULL

######################Figure 6.#########################
#Figure 6A generated using ArrayStudio
#Figure 6B (code below is repeated for each chemokine):
# Worked example for CCL5: expression on the single-cell embedding, expression
# by consensus class in GSE47460, and Dunn's post-hoc test between classes.
FeaturePlot(gse135893_ipf_control, features = 'CCL5', pt.size = 1)
ggplot(gse47460_phenom_consensus, aes(consensusclass, CCL5, fill = consensusclass)) + 
  geom_boxplot(width=0.5, show.legend = F) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() + 
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) + theme(legend.position = "none")
# Test the same gene that is plotted above. The example previously tested
# CXCL17 here, so the reported statistics did not belong to the CCL5 panel.
dunnTest(CCL5~consensusclass, data = gse47460_phenom_consensus)

######################Figure 7., Supplementary Figures 8 and 9.####################
#process GSE135893 to separate 'Ciliated_high' and 'Ciliated_low' donors
# Load the annotated object and keep only the cells whose Diagnosis is 'IPF'.
gse135893 <- readRDS('GSE135893_ILD_annotated_fullsize.rds')
gse135893_ipf <- subset(gse135893, cells = row.names(subset(gse135893@meta.data, gse135893@meta.data$Diagnosis=='IPF')))
DefaultAssay(gse135893_ipf) <- 'RNA'
# Percentage of counts from features matching "^MT-" is stored as percent.mt
# and regressed out during scaling.
gse135893_ipf <- PercentageFeatureSet(gse135893_ipf, pattern = "^MT-", col.name = "percent.mt")
gse135893_ipf <- NormalizeData(gse135893_ipf, verbose = TRUE)
gse135893_ipf <- FindVariableFeatures(gse135893_ipf, selection.method = "vst", nfeatures = 2000)
gse135893_ipf <- ScaleData(gse135893_ipf, verbose = TRUE, vars.to.regress = 'percent.mt')

# PCA with 30 components; UMAP, neighbour graph and clustering use the first 20.
gse135893_ipf <- RunPCA(gse135893_ipf, npcs = 30, verbose = TRUE)
gse135893_ipf <- JackStraw(gse135893_ipf, num.replicate = 100)
gse135893_ipf <- ScoreJackStraw(gse135893_ipf, dims = 1:20)
gse135893_ipf <- RunUMAP(gse135893_ipf, reduction = "pca", dims = 1:20)
gse135893_ipf <- FindNeighbors(gse135893_ipf, reduction = "pca", dims = 1:20)
gse135893_ipf <- FindClusters(gse135893_ipf, resolution = 0.6)
FeaturePlot(gse135893_ipf, features = c('SPP1', 'C1QA', 'GZMB', 'JCHAIN', 'ACKR1', 'HAS1'), pt.size = 1)
# NOTE(review): the next three calls use the 'ciliated' metadata column and the
# identity labels 'HAS1_fibro_14' / 'Fibroblasts_17'. Both are only assigned
# further down in this script (the celltypenew mapping just below, and the
# 'ciliated' mapping after the save() call), so these lines cannot run in
# top-to-bottom order on a fresh object -- confirm the intended execution order.
FeaturePlot(gse135893_ipf, features = c('CDH11', 'LPAR1', 'LOXL2'), pt.size = 1, split.by = 'ciliated')
VlnPlot(gse135893_ipf, features = c('CDH11', 'LPAR1', 'LOXL2'), pt.size = 0, log = T, group.by = 'ciliated', idents = 'HAS1_fibro_14')
# MAST test between 'Ciliated_high' and 'Ciliated_low' cells within the
# 'Fibroblasts_17' identity; logfc.threshold = 0 disables fold-change
# pre-filtering.
gse135893_fibroblasts_ciliatedhighvslow <- FindMarkers(gse135893_ipf, test.use = 'MAST', logfc.threshold = 0,
                                                       subset.ident = 'Fibroblasts_17', group.by = 'ciliated', 
                                                       ident.1 = 'Ciliated_high', ident.2 = 'Ciliated_low')
gse135893_fibroblasts_ciliatedhighvslow$gene <- row.names(gse135893_fibroblasts_ciliatedhighvslow)
# Rename the 30 clusters with hand-assigned labels. The `to` vector is
# positional: it must line up with the sorted unique identities, so it is only
# valid for the clustering result it was written for.
gse135893_ipf@meta.data$celltypenew <- plyr::mapvalues(gse135893_ipf@active.ident, 
                                                    from = sort(unique(gse135893_ipf@active.ident)),
                                                    to = c('SPP1pos_macs_0', 'Ciliated_1', 'C1QA_mac_2', 'Ciliated_3', 'AT1_4', 'AT2_5', 'C1QA_mac_6', 
                                                           'ACKR1pos_endo_7', 'Monocytes_8', 'AT1_9', 'Th_10', 'AT1_11', 'Macs_12', 'Tc_13', 'HAS1_fibro_14',
                                                           'Diff_ciliated_15', 'ACKR1neg_endo_16', 'Fibroblasts_17', 'Prolif_macs_18', 'Lymph_endo_19',
                                                           'Sm_20', 'Bcells_21', 'Macs_22', 'PC_23', 'AT2_24', 'MC_25', 'AT1_26', 'Macs_27', 'Ciliated_28', 
                                                           'Fibroblast_29'))
# Make the new labels the active identities, plot them, and save the object
# (the save happens before the 'ciliated' column is added below).
Idents(gse135893_ipf) <- 'celltypenew'
DimPlot(gse135893_ipf, label = T, label.size = 4) + NoLegend()
save(gse135893_ipf, file = 'gse135893_ipfonly_seurat_withseuratnorm_062620.RData')

# Percentage of each donor's cells that falls into every cell type.
#
# x: an object whose `meta.data` slot is a data frame with the columns
#    `orig.ident` (donor / sample) and `celltypenew` (cell-type label).
# Returns a data.frame with one row per `orig.ident` value and one column per
# `celltypenew` value; each row holds percentages rounded to 2 decimals.
cellclusters <- function(x){
  # Tabulate once (the table used to be recomputed three times).
  counts <- table(x@meta.data$orig.ident, x@meta.data$celltypenew)
  # Convert to a plain matrix so as.data.frame() returns the wide layout
  # instead of the long Var1/Var2/Freq layout it produces for "table" objects.
  abstable <- matrix(as.vector(counts), nrow = nrow(counts), ncol = ncol(counts),
                     dimnames = list(rownames(counts), colnames(counts)))
  # Row-wise proportions, expressed as percentages.
  perctable <- round(prop.table(abstable, 1)*100, 2)
  as.data.frame(perctable)
}
# Per-donor cell-type percentages (rows = orig.ident, columns = celltypenew),
# exported to CSV.
gse135893_ipf_perctable <- cellclusters(gse135893_ipf)
#celltypebydonor <- as.matrix.data.frame(table(gse135893_ipf@meta.data$orig.ident, gse135893_ipf@meta.data$celltype))
write.csv(gse135893_ipf_perctable, file = 'gse135893_ipfonly_celltypebydonorpercentage_seuratnorm.csv')
# Label every cell by its donor's group. The `to` vector is positional: its 19
# entries must line up with the sorted unique orig.ident values, so it is only
# valid for this exact donor set.
# NOTE(review): the high/low assignment is hard-coded here rather than derived
# from the percentage table -- confirm it matches the threshold-based
# classification computed for Supplementary Table 3.
gse135893_ipf@meta.data$ciliated <- plyr::mapvalues(gse135893_ipf@meta.data$orig.ident, 
                                                    from = sort(unique(gse135893_ipf@meta.data$orig.ident)),
                                                    to = c(rep('Ciliated_low', 8), rep('Ciliated_high', 4), rep('Ciliated_low', 2), 
                                                           rep('Ciliated_high', 2), 'Ciliated_low', 'Ciliated_high', 'Ciliated_low'))

#Supplementary Figure 8:
DimPlot(gse135893_ipf, label = T, label.size = 4, split.by = 'ciliated') + NoLegend()

#Supplementary Table 3:
# Pooled percentages per donor. Columns are selected by exact name: the
# previous `$Ciliated_2` referred to a column that does not exist in the
# cluster labels assigned above (cluster 2 is 'C1QA_mac_2'), and `$` on a data
# frame then partially matched 'Ciliated_28', which was counted twice.
# Selecting with `[` errors on a missing column instead of guessing.
# Reduce(`+`, ...) adds the columns left to right, like the original `+` chain.
ciliated_columns <- c('Ciliated_1', 'Ciliated_3', 'Ciliated_28', 'Diff_ciliated_15')
gse135893_ipf_perctable$totalciliated <- Reduce(`+`, gse135893_ipf_perctable[ciliated_columns])
# Donors with more than 20% ciliated cells are 'ciliated_high'.
ciliatedvector <- ifelse(gse135893_ipf_perctable$totalciliated > 20, 'ciliated_high', 'ciliated_low')

myeloid_columns <- c('SPP1pos_macs_0', 'C1QA_mac_2', 'C1QA_mac_6', 'Monocytes_8',
                     'Macs_12', 'Prolif_macs_18', 'Macs_22', 'Macs_27')
gse135893_ipf_perctable$totalmyeloid <- Reduce(`+`, gse135893_ipf_perctable[myeloid_columns])

endothelial_columns <- c('ACKR1pos_endo_7', 'ACKR1neg_endo_16', 'Lymph_endo_19')
gse135893_ipf_perctable$totalendothelial <- Reduce(`+`, gse135893_ipf_perctable[endothelial_columns])

gse135893_ipf_perctable$ciliatedphenotype <- ciliatedvector
# Welch t-test of every percentage column between the two donor groups,
# collected as p-value and difference of group means, then BH-adjusted.
# NOTE(review): columns 1:31 cover the 30 cell types plus totalciliated, but
# not totalmyeloid / totalendothelial (columns 32-33) -- confirm whether those
# two were meant to be tested as well.
ciliatedtests <- apply(gse135893_ipf_perctable[,1:31], 2, function(k) t.test(k~ciliatedphenotype, data = gse135893_ipf_perctable))
ciliated_p <- lapply(ciliatedtests, function(k) c(k$p.value, (k$estimate[1]-k$estimate[2])))
ciliatedteststable <- as.data.frame(do.call(rbind, ciliated_p))
colnames(ciliatedteststable) <- c('p_value', 'difference')
ciliatedteststable$padjust <- p.adjust(ciliatedteststable$p_value, method = 'BH')

#Supplementary Figure 8B. Repeated for all three populations.
ggplot(gse135893_ipf_perctable, aes(ciliatedphenotype, totalciliated, fill = ciliatedphenotype)) + 
  geom_boxplot(width=0.5, show.legend = FALSE) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() + 
  scale_fill_manual(values=c("#999999", "#E69F00")) + theme(legend.position = "none")


#############################ciliated clusterprofiler#####################
#Supplementary Tables 2A and 2B are generated in IPA

# MAST differential expression of 'Ciliated_high' cells versus 'Healthy' cells
# (groups taken from the 'ciliated' metadata column of
# gse135893_ipf_control_noig); genes with adjusted p < 0.05 are exported.
gse135893_ipf_ciliatedhighvshealthy <- FindMarkers(gse135893_ipf_control_noig, test.use = 'MAST', 
                                                   group.by = 'ciliated', ident.1 = 'Ciliated_high', ident.2 = 'Healthy')
gse135893_ipf_ciliatedhighvshealthy$gene <- row.names(gse135893_ipf_ciliatedhighvshealthy)
gse135893_ipf_ciliatedhighvshealthy_sig <- subset(gse135893_ipf_ciliatedhighvshealthy, p_val_adj<0.05)
# Columns are picked by position (6, 2, 5); column 6 is the `gene` column added
# above. NOTE(review): columns 2 and 5 are presumably avg_logFC and p_val_adj,
# which depends on the column order FindMarkers returns -- confirm.
xlsx::write.xlsx(gse135893_ipf_ciliatedhighvshealthy_sig[,c(6, 2, 5)], file = 'gse135893_ipf_ciliatedhighvshealthy.xlsx', row.names = F)

# Same comparison for 'Ciliated_low' cells versus 'Healthy' cells.
gse135893_ipf_ciliatedlowvshealthy <- FindMarkers(gse135893_ipf_control_noig, test.use = 'MAST', 
                                                  group.by = 'ciliated', ident.1 = 'Ciliated_low', ident.2 = 'Healthy')
gse135893_ipf_ciliatedlowvshealthy$gene <- row.names(gse135893_ipf_ciliatedlowvshealthy)
gse135893_ipf_ciliatedlowvshealthy_sig <- subset(gse135893_ipf_ciliatedlowvshealthy, p_val_adj<0.05)
xlsx::write.xlsx(gse135893_ipf_ciliatedlowvshealthy_sig[,c(6, 2, 5)], file = 'gse135893_ipf_ciliatedlowvshealthy.xlsx', row.names = F)

# Gene-set enrichment for the Ciliated_low vs Healthy result.
# Map the significant gene symbols to Entrez IDs, join the IDs back onto the
# differential-expression table and sort by avg_logFC, largest first.
ciliatedlowlist <- row.names(gse135893_ipf_ciliatedlowvshealthy_sig)
ciliatedlowentrez <- bitr(ciliatedlowlist, fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Hs.eg.db)
ciliatedlowlistentrez <- merge(ciliatedlowentrez, gse135893_ipf_ciliatedlowvshealthy_sig, by.x = 'SYMBOL', by.y='gene')
ciliatedlowlistentrez <- ciliatedlowlistentrez[order(ciliatedlowlistentrez$avg_logFC, decreasing = T),]
# The ranking statistic is taken by position (4th column of the merged table).
# NOTE(review): presumably avg_logFC, the column the table was just sorted by
# -- confirm the merged column order.
ciliatedlowlist_vector <- ciliatedlowlistentrez[,4]
names(ciliatedlowlist_vector) <- ciliatedlowlistentrez$ENTREZID
# Ranked-list enrichment against GO biological process and KEGG.
ciliatedlowlist_go <- gseGO(geneList = ciliatedlowlist_vector, OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000, 
                            minGSSize = 15, maxGSSize = 500, pvalueCutoff = 0.2, verbose = TRUE)
ciliatedlowlist_kegg <- gseKEGG(geneList = ciliatedlowlist_vector, organism = 'hsa', nPerm = 1000, minGSSize = 15, 
                                pvalueCutoff = 0.2, verbose = TRUE)
ciliatedlowlist_goresults <- ciliatedlowlist_go@result
ciliatedlowlist_keggresults <- ciliatedlowlist_kegg@result

# Reactome enrichment; pvalueCutoff = 1 keeps every tested pathway. Results are
# sorted by NES, largest first.
ciliatedlowlist_reactome <- gsePathway(ciliatedlowlist_vector, nPerm=10000, minGSSize = 15,
                                       pvalueCutoff=1, pAdjustMethod="BH", verbose=TRUE)
ciliatedlowlist_reactome_results <- ciliatedlowlist_reactome@result
ciliatedlowlist_reactome_results <- ciliatedlowlist_reactome_results[order(ciliatedlowlist_reactome_results$NES, decreasing = T),]
#Supplementary Table 2C
# Only pathways with a positive NES are exported.
ciliatedlowlist_reactome_results_up <- ciliatedlowlist_reactome_results[which(ciliatedlowlist_reactome_results$NES>0),]
xlsx::write.xlsx(ciliatedlowlist_reactome_results_up, file = 'gse135893_ciliatedlowlist_reactome_results_up.xlsx')
#emapplot(ciliatedlowlist_reactome)

# Gene-set enrichment for the Ciliated_high vs Healthy result.
# Map the significant gene symbols to Entrez IDs, join the IDs back onto the
# differential-expression table and sort by avg_logFC, largest first.
ciliatedhighlist <- row.names(gse135893_ipf_ciliatedhighvshealthy_sig)
ciliatedhighentrez <- bitr(ciliatedhighlist, fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Hs.eg.db)
ciliatedhighlistentrez <- merge(ciliatedhighentrez, gse135893_ipf_ciliatedhighvshealthy_sig, by.x = 'SYMBOL', by.y='gene')
ciliatedhighlistentrez <- ciliatedhighlistentrez[order(ciliatedhighlistentrez$avg_logFC, decreasing = T),]
# The ranking statistic is taken by position (4th column of the merged table).
# NOTE(review): presumably avg_logFC, the column the table was just sorted by
# -- confirm the merged column order.
ciliatedhighlist_vector <- ciliatedhighlistentrez[,4]
names(ciliatedhighlist_vector) <- ciliatedhighlistentrez$ENTREZID
# Ranked-list enrichment against GO biological process and KEGG.
ciliatedhighlist_go <- gseGO(geneList = ciliatedhighlist_vector, OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000, 
                             minGSSize = 15, maxGSSize = 500, pvalueCutoff = 0.2, verbose = TRUE)
ciliatedhighlist_kegg <- gseKEGG(geneList = ciliatedhighlist_vector, organism = 'hsa', nPerm = 1000, minGSSize = 15, 
                                 pvalueCutoff = 0.2, verbose = TRUE)
ciliatedhighlist_goresults <- ciliatedhighlist_go@result
ciliatedhighlist_keggresults <- ciliatedhighlist_kegg@result
# Reactome enrichment; pvalueCutoff = 1 keeps every tested pathway. Results are
# sorted by NES, largest first.
ciliatedhighlist_reactome <- gsePathway(ciliatedhighlist_vector, nPerm=10000, minGSSize = 15,
                                        pvalueCutoff=1, pAdjustMethod="BH", verbose=TRUE)
ciliatedhighlist_reactome_results <- ciliatedhighlist_reactome@result
ciliatedhighlist_reactome_results <- ciliatedhighlist_reactome_results[order(ciliatedhighlist_reactome_results$NES, decreasing = T),]
#Supplementary Table 2D
# Only pathways with a positive NES are exported.
ciliatedhighlist_reactome_results_up <- ciliatedhighlist_reactome_results[which(ciliatedhighlist_reactome_results$NES>0),]
xlsx::write.xlsx(ciliatedhighlist_reactome_results_up, file = 'gse135893_ciliatedhighlist_reactome_results_up.xlsx')

#Figure 7A:
# Distribution of the pooled ciliated-cell percentage across donors.
hist(gse135893_ipf_perctable$totalciliated, col = 'blue')

#PyMINEr and NicheNet
library(CBDD) # Implementation of PyMINEr by Clarivate Analytics
library(CBDDnetworks)
library(nichenetr)
load('~/Rpackages/nichenetr_start_060420.RData') # compilation of NicheNet ligand receptor files by authors of NicheNet
#https://github.com/saeyslab/nichenetr

# Dense expression matrix from the RNA assay's data slot, keeping only
# features whose row sum is above zero.
gse135893_ipfmatrix <- as.matrix(gse135893_ipf@assays$RNA@data)
gse135893_ipfmatrix <- gse135893_ipfmatrix[which(rowSums(gse135893_ipfmatrix)>0),]
# One row per cell: its active identity as a character `celltype`, plus the
# cell name copied from the row names.
gse135893_ipfidents <- as.data.frame(gse135893_ipf@active.ident)
colnames(gse135893_ipfidents) <- 'celltype'
gse135893_ipfidents$celltype <- as.character(gse135893_ipfidents$celltype)
gse135893_ipfidents$cellnames <- row.names(gse135893_ipfidents)
#ligand-receptor interactions obtained from PMID 26198319, formatted to file named 'receptor_ligand.csv'
reclig <- read.csv('/sc/wo/tri_data/jk/2019_07_Hs_Sheppard_IPF_ILD_scRNAseq/receptor_ligand.csv', header = T)
# Keep pairs whose Evidence is 'literature_supported' or 'putative', then keep
# the 2nd and 3rd columns and rename them ligand / receptor.
# NOTE(review): the columns are chosen by position -- confirm against the file
# that column 2 is the ligand and column 3 the receptor.
reclig <- subset(reclig, reclig$Evidence %in% c('literature_supported', 'putative'))
reclig <- reclig[,c(2, 3)]
colnames(reclig) <- c('ligand', 'receptor')

# Split the cells by donor group ('ciliated' metadata column) and run PyMINEr
# separately on each group's expression matrix and cell-type labels.
ciliated_highcells <- row.names(subset(gse135893_ipf@meta.data, ciliated=='Ciliated_high'))
ciliated_lowcells <- row.names(subset(gse135893_ipf@meta.data, ciliated=='Ciliated_low'))
ciliated_highmatrix <- gse135893_ipfmatrix[,ciliated_highcells]
ciliated_lowmatrix <- gse135893_ipfmatrix[,ciliated_lowcells]
library(psych)
ciliated_highidents <- gse135893_ipfidents[ciliated_highcells,]
ciliated_lowidents <- gse135893_ipfidents[ciliated_lowcells,]
ciliated_high_miner <- PyMINEr(ciliated_highmatrix, reclig, ciliated_highidents$celltype, 'logcounts')
# `meanz` is the geometric mean of result columns 5 and 6.
# NOTE(review): presumably these are the z1 / z2 columns filtered on later; a
# geometric mean is undefined for negative values -- confirm the scores are
# positive where it matters.
ciliated_high_miner$meanz <- apply(ciliated_high_miner[,5:6], 1, geometric.mean)
ciliated_low_miner <- PyMINEr(ciliated_lowmatrix, reclig, ciliated_lowidents$celltype, 'logcounts')
ciliated_low_miner$meanz <- apply(ciliated_low_miner[,5:6], 1, geometric.mean)
save(ciliated_high_miner, ciliated_low_miner, file = 'gse135893_pyminer_seuratnorm_ipf_cilitedsubsets.RData')

###########ciliated low mac (condition 1, Figure 7B)################
# Keep interactions whose celltype1 name contains 'Mac' or 'Mono', then keep
# the rows above the 95th percentile of z1 and, among those, the rows above the
# 95th percentile of z2 (the second quantile is computed on the already
# filtered table).
ciliated_low_miner_mac <- ciliated_low_miner[c(grep('Mac', ciliated_low_miner$celltype1), grep('Mono', ciliated_low_miner$celltype1)),]
ciliated_low_miner_mac <- subset(ciliated_low_miner_mac, z1>quantile(ciliated_low_miner_mac$z1, 0.95))
ciliated_low_miner_mac <- subset(ciliated_low_miner_mac, z2>quantile(ciliated_low_miner_mac$z2, 0.95))

# One random colour per distinct ligand and per distinct receptor.
grid_col_ligand =rand_color(length(unique(ciliated_low_miner_mac$ligand)))
names(grid_col_ligand) <- unique(ciliated_low_miner_mac$ligand)
grid_col_target = rand_color(length(unique(ciliated_low_miner_mac$receptor)))
names(grid_col_target) <- unique(ciliated_low_miner_mac$receptor)

# Colour lookup tables, joined onto the link table below.
grid_col_tbl_ligand = tibble(ligand = grid_col_ligand %>% names(), color_ligand_type = grid_col_ligand)
grid_col_tbl_ligand
grid_col_tbl_target = tibble(receptor = grid_col_target %>% names(), color_target_type = grid_col_target)
grid_col_tbl_target
# Link table: columns 1-4 and 7 of the filtered result. The code below reads
# ligand, receptor, celltype1, celltype2 and meanz from it.
circos_links <- ciliated_low_miner_mac[,c(1:4, 7)]
circos_links = circos_links %>% inner_join(grid_col_tbl_ligand, by = 'ligand') %>% inner_join(grid_col_tbl_target, by = 'receptor')
# NOTE(review): `select` is called unqualified; several attached packages
# export a `select` -- confirm dplyr's version is the one found here.
links_circle = circos_links %>% select(ligand,receptor, meanz)
#circos_links = circos_links %>% mutate(ligand = paste(ligand," "))

# Named colour vectors (sector name -> colour) for ligands and receptors.
ligand_color = circos_links %>% distinct(ligand,color_ligand_type)
grid_ligand_color = ligand_color$color_ligand_type %>% set_names(ligand_color$ligand)
target_color = circos_links %>% distinct(receptor,color_target_type)
grid_target_color = target_color$color_target_type %>% set_names(target_color$receptor)

grid_col =c(grid_ligand_color,grid_target_color)

# give the option that links in the circos plot will be transparent ~ ligand-target potential score
# meanz is min-max scaled to [0, 1] and inverted, so the highest-scoring link
# is the least transparent.
transparency = circos_links %>% mutate(meanz =(meanz-min(meanz))/(max(meanz)-min(meanz))) %>% mutate(transparency = 1-meanz) %>% .$transparency

# Collapse the receiving cell types (celltype2) into broader groups. The `to`
# vector is positional and must line up with sort(unique(celltype2)), so it is
# only valid for the exact set of cell types present in this filtered table.
circos_links$condensedcelltype <- plyr::mapvalues(circos_links$celltype2, from = sort(unique(circos_links$celltype2)),
                                                  to = c('AT1', rep('B', 2), 'Basal', 'Ciliated', 'Club', rep('DC', 3),
                                                         'Fibroblast', 'Goblet', 'Endothelial', rep('Mac', 3), 'Fibroblast',
                                                         'Epithelial', 'NK', 'Sm', 'T', rep('Endothelial', 5)))
# Count table (condensed cell type x receptor); receptoroccurence_total is the
# number of condensed cell types in which each receptor occurs.
receptoroccurence <- as.matrix.data.frame(table(circos_links$condensedcelltype, circos_links$receptor))
rownames(receptoroccurence) <- sort(unique(circos_links$condensedcelltype))
colnames(receptoroccurence) <- sort(unique(circos_links$receptor))
receptoroccurence <- as.data.frame(receptoroccurence)
receptoroccurence_total <- apply(receptoroccurence, 2, function(k) sum(k>0))
# A link is labelled with its condensed cell type when its receptor occurs in
# exactly one condensed cell type, and 'mixed' otherwise.
condensedcelltype_links <- vector()
for (i in 1:nrow(circos_links)){
  if (receptoroccurence_total[circos_links$receptor[i]]==1){
    condensedcelltype_links[i] <- circos_links$condensedcelltype[i]
  }else{
    condensedcelltype_links[i] <- 'mixed'
  }
}
circos_links$targetcelltype <- condensedcelltype_links
# Reorder the links: cell-type-specific receptors first (sorted by target cell
# type), 'mixed' receptors last.
circos_links_mixed <- subset(circos_links, circos_links$targetcelltype=='mixed')
circos_links_unique <- subset(circos_links, circos_links$targetcelltype!='mixed')
circos_links_unique <- circos_links_unique[order(circos_links_unique$targetcelltype),]
circos_links_reorg <- as.data.frame(rbind(circos_links_unique, circos_links_mixed))

# Sector order: ligands alphabetically, then receptors in the reorganised order.
target_order = circos_links_reorg$receptor %>% unique()
ligand_order = circos_links_reorg[order(circos_links_reorg$celltype1),]$ligand %>% unique() %>% sort()
order = c(ligand_order,target_order)
#Figure 7B-7C
circos.clear()
# Two empty tracks are pre-allocated: track 1 for the highlight bands, track 2
# for the sector name labels.
chordDiagram(links_circle, directional = 1,order=order,link.sort = TRUE, link.decreasing = FALSE, grid.col = grid_col,transparency = transparency, diffHeight = 0.005, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow", link.visible = links_circle$meanz,annotationTrack = "grid", 
             preAllocateTracks = list(list(track.height = 0.075), list(track.height = 0.2)))
circos.track(track.index = 2, panel.fun = function(x, y) {
  circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
              facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 0.7)
}, bg.border = NA) #
# One band over all ligand sectors, then one band per target cell-type group.
highlight.sector(circos_links_reorg$ligand, track.index = 1, col = "red", font = 2, facing = 'bending.outside', text.vjust = '0.5mm',
                 text = "Ligands produced by macrophages", cex = 0.6, text.col = "white", niceFacing = TRUE)
# NOTE(review): `col` receives one random colour per target group on every
# call, i.e. a vector rather than a single colour per band -- confirm this is
# intended (rand_color(1) would give each band one colour).
lapply(unique(circos_links_reorg$targetcelltype), 
       function(k) highlight.sector(subset(circos_links_reorg, targetcelltype==k)$receptor, track.index = 1, col = rand_color(length(unique(circos_links_reorg$targetcelltype))), font = 2, facing = 'bending.inside', text.vjust = '0.5mm',
                                    text = k, cex = 0.6, text.col = "white", niceFacing = TRUE))

##############ciliated high mac (condition 2, Figure 7C)#####################
# Keep interactions whose sending cell type (celltype1) matches 'Mac' or 'Mono'.
# The two grep() results are concatenated, so a row matching both patterns would be selected twice.
ciliated_high_miner_mac <- ciliated_high_miner[c(grep('Mac', ciliated_high_miner$celltype1), grep('Mono', ciliated_high_miner$celltype1)),]
# Two sequential filters: rows above the 90th percentile of z1, then rows above the 90th
# percentile of z2 computed on the rows that survived the first filter.
ciliated_high_miner_mac <- subset(ciliated_high_miner_mac, z1>quantile(ciliated_high_miner_mac$z1, 0.9))
ciliated_high_miner_mac <- subset(ciliated_high_miner_mac, z2>quantile(ciliated_high_miner_mac$z2, 0.9))

# One random colour per distinct ligand and per distinct receptor, named by gene.
grid_col_ligand =rand_color(length(unique(ciliated_high_miner_mac$ligand)))
names(grid_col_ligand) <- unique(ciliated_high_miner_mac$ligand)
grid_col_target = rand_color(length(unique(ciliated_high_miner_mac$receptor)))
names(grid_col_target) <- unique(ciliated_high_miner_mac$receptor)

# Colour lookup tables used for the joins below; the bare names print the tibbles for inspection.
grid_col_tbl_ligand = tibble(ligand = grid_col_ligand %>% names(), color_ligand_type = grid_col_ligand)
grid_col_tbl_ligand
grid_col_tbl_target = tibble(receptor = grid_col_target %>% names(), color_target_type = grid_col_target)
grid_col_tbl_target
# Columns are picked by position (1-4 and 7); the code below relies on them providing
# ligand, receptor, celltype1, celltype2 and meanz -- TODO confirm against the miner table layout.
circos_links <- ciliated_high_miner_mac[,c(1:4, 7)]
circos_links = circos_links %>% inner_join(grid_col_tbl_ligand, by = 'ligand') %>% inner_join(grid_col_tbl_target, by = 'receptor')
# Three-column link table (from, to, value) passed to chordDiagram().
links_circle = circos_links %>% select(ligand,receptor, meanz)
#circos_links = circos_links %>% mutate(ligand = paste(ligand," "))

# Collapse the joined colours back into named vectors (gene -> colour) for grid.col.
ligand_color = circos_links %>% distinct(ligand,color_ligand_type)
grid_ligand_color = ligand_color$color_ligand_type %>% set_names(ligand_color$ligand)
target_color = circos_links %>% distinct(receptor,color_target_type)
grid_target_color = target_color$color_target_type %>% set_names(target_color$receptor)

grid_col =c(grid_ligand_color,grid_target_color)

# give the option that links in the circos plot will be transparent ~ ligand-target potential score
# meanz is min-max scaled to [0, 1]; transparency is 1 - scaled value, so the link with the
# highest meanz gets transparency 0 and the one with the lowest gets 1.
transparency = circos_links %>% mutate(meanz =(meanz-min(meanz))/(max(meanz)-min(meanz))) %>% mutate(transparency = 1-meanz) %>% .$transparency

# Map each receiving cell type (celltype2) to a coarser label. The mapping is positional against
# the alphabetically sorted distinct celltype2 values, so the hand-written `to` vector is tied to
# the cell types present in this particular filtered table.
# NOTE(review): presumably `to` must have exactly one entry per distinct celltype2 -- verify if the filters change.
circos_links$condensedcelltype <- plyr::mapvalues(circos_links$celltype2, from = sort(unique(circos_links$celltype2)),
                                                  to = c('Basal', 'Ciliated', 'Club', 'Fibroblast',
                                                         'Endothelial', rep('Mac', 5), 'Fibroblast',
                                                         'Epithelial', 'NK', 'Pericyte', rep('T', 2), 'Endothelial'))
# Count table of condensed cell type x receptor; row and column names are re-assigned from the
# sorted distinct values after the conversion to a plain matrix.
receptoroccurence <- as.matrix.data.frame(table(circos_links$condensedcelltype, circos_links$receptor))
rownames(receptoroccurence) <- sort(unique(circos_links$condensedcelltype))
colnames(receptoroccurence) <- sort(unique(circos_links$receptor))
receptoroccurence <- as.data.frame(receptoroccurence)
# For each receptor: in how many condensed cell types it occurs at least once.
receptoroccurence_total <- apply(receptoroccurence, 2, function(k) sum(k>0))
# A receptor seen in exactly one condensed cell type keeps that label; otherwise it is tagged 'mixed'.
condensedcelltype_links <- vector()
for (i in 1:nrow(circos_links)){
  if (receptoroccurence_total[circos_links$receptor[i]]==1){
    condensedcelltype_links[i] <- circos_links$condensedcelltype[i]
  }else{
    condensedcelltype_links[i] <- 'mixed'
  }
}
circos_links$targetcelltype <- condensedcelltype_links
# Reorder links: cell-type-specific receptors first (sorted by target cell type), 'mixed' ones last.
circos_links_mixed <- subset(circos_links, circos_links$targetcelltype=='mixed')
circos_links_unique <- subset(circos_links, circos_links$targetcelltype!='mixed')
circos_links_unique <- circos_links_unique[order(circos_links_unique$targetcelltype),]
circos_links_reorg <- as.data.frame(rbind(circos_links_unique, circos_links_mixed))

# Sector order: ligands first, then receptors in the reorganised order.
# The trailing sort() puts ligands in alphabetical order, overriding the ordering by celltype1.
target_order = circos_links_reorg$receptor %>% unique()
ligand_order = circos_links_reorg[order(circos_links_reorg$celltype1),]$ligand %>% unique() %>% sort()
order = c(ligand_order,target_order)

#Figure 7B-7C
# Reset circlize state, then draw the directional chord diagram with two pre-allocated outer tracks.
circos.clear()
chordDiagram(links_circle, directional = 1,order=order,link.sort = TRUE, link.decreasing = FALSE, grid.col = grid_col,transparency = transparency, diffHeight = 0.005, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow", link.visible = links_circle$meanz,annotationTrack = "grid", 
             preAllocateTracks = list(list(track.height = 0.075), list(track.height = 0.2)))
# Track 2 carries the gene name of every sector, written radially; track 1 is not annotated in this section.
circos.track(track.index = 2, panel.fun = function(x, y) {
  circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
              facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 0.7)
}, bg.border = NA) #
               

###################ciliated_low (condition 3, Figure 7D)##################
# Keep interactions whose sending cell type (celltype1) matches 'Ciliated'.
ciliated_low_miner_ciliated <- ciliated_low_miner[grep('Ciliated', ciliated_low_miner$celltype1),]
# Two sequential filters: rows above the 95th percentile of z1, then rows above the 95th
# percentile of z2 computed on the rows that survived the first filter.
ciliated_low_miner_ciliated <- subset(ciliated_low_miner_ciliated, z1>quantile(ciliated_low_miner_ciliated$z1, 0.95))
ciliated_low_miner_ciliated <- subset(ciliated_low_miner_ciliated, z2>quantile(ciliated_low_miner_ciliated$z2, 0.95))

circos.clear()
# Columns are picked by position (1-4 and 7); the code below relies on them providing
# ligand, receptor, celltype1, celltype2 and meanz -- TODO confirm against the miner table layout.
circos_links <- ciliated_low_miner_ciliated[,c(1:4, 7)]
# One random colour per distinct ligand and per distinct receptor, named by gene.
grid_col_ligand =rand_color(length(unique(ciliated_low_miner_ciliated$ligand)))
names(grid_col_ligand) <- unique(ciliated_low_miner_ciliated$ligand)
grid_col_ligand
grid_col_target = rand_color(length(unique(ciliated_low_miner_ciliated$receptor)))
names(grid_col_target) <- unique(ciliated_low_miner_ciliated$receptor)

# Colour lookup tables used for the joins below; the bare names print the tibbles for inspection.
grid_col_tbl_ligand = tibble(ligand = grid_col_ligand %>% names(), color_ligand_type = grid_col_ligand)
grid_col_tbl_ligand
grid_col_tbl_target = tibble(receptor = grid_col_target %>% names(), color_target_type = grid_col_target)
grid_col_tbl_target

circos_links = circos_links %>% inner_join(grid_col_tbl_ligand, by = 'ligand') %>% inner_join(grid_col_tbl_target, by = 'receptor')
# Three-column link table (from, to, value) passed to chordDiagram().
links_circle = circos_links %>% select(ligand,receptor, meanz)
#circos_links = circos_links %>% mutate(ligand = paste(ligand," ")) # extra space: make a difference between a gene as ligand and a gene as target!

# Collapse the joined colours back into named vectors (gene -> colour) for grid.col.
ligand_color = circos_links %>% distinct(ligand,color_ligand_type)
grid_ligand_color = ligand_color$color_ligand_type %>% set_names(ligand_color$ligand)
target_color = circos_links %>% distinct(receptor,color_target_type)
grid_target_color = target_color$color_target_type %>% set_names(target_color$receptor)

grid_col =c(grid_ligand_color,grid_target_color)

# give the option that links in the circos plot will be transparent ~ ligand-target potential score
# meanz is min-max scaled to [0, 1]; transparency is 1 - scaled value, so the link with the
# highest meanz gets transparency 0 and the one with the lowest gets 1.
transparency = circos_links %>% mutate(meanz =(meanz-min(meanz))/(max(meanz)-min(meanz))) %>% mutate(transparency = 1-meanz) %>% .$transparency

# Map each receiving cell type (celltype2) to a coarser label. The mapping is positional against
# the alphabetically sorted distinct celltype2 values, so the hand-written `to` vector is tied to
# the cell types present in this particular filtered table.
# NOTE(review): presumably `to` must have exactly one entry per distinct celltype2 -- verify if the filters change.
circos_links$condensedcelltype <- plyr::mapvalues(circos_links$celltype2, from = sort(unique(circos_links$celltype2)),
                                                  to = c('Basal', 'Ciliated', 'Club', 'Fibroblast', 'Goblet', 'Endothelial', 
                                                         rep('Mac', 2), 'Fibroblast', 'Pericyte', 'Sm', rep('T', 2), rep('Endothelial', 4)))
# Count table of condensed cell type x receptor; row and column names are re-assigned from the
# sorted distinct values after the conversion to a plain matrix.
receptoroccurence <- as.matrix.data.frame(table(circos_links$condensedcelltype, circos_links$receptor))
rownames(receptoroccurence) <- sort(unique(circos_links$condensedcelltype))
colnames(receptoroccurence) <- sort(unique(circos_links$receptor))
receptoroccurence <- as.data.frame(receptoroccurence)
# For each receptor: in how many condensed cell types it occurs at least once.
receptoroccurence_total <- apply(receptoroccurence, 2, function(k) sum(k>0))
# A receptor seen in exactly one condensed cell type keeps that label; otherwise it is tagged 'mixed'.
condensedcelltype_links <- vector()
for (i in 1:nrow(circos_links)){
  if (receptoroccurence_total[circos_links$receptor[i]]==1){
    condensedcelltype_links[i] <- circos_links$condensedcelltype[i]
  }else{
    condensedcelltype_links[i] <- 'mixed'
  }
}
circos_links$targetcelltype <- condensedcelltype_links
# Reorder links: cell-type-specific receptors first (sorted by target cell type), 'mixed' ones last.
circos_links_mixed <- subset(circos_links, circos_links$targetcelltype=='mixed')
circos_links_unique <- subset(circos_links, circos_links$targetcelltype!='mixed')
circos_links_unique <- circos_links_unique[order(circos_links_unique$targetcelltype),]
circos_links_reorg <- as.data.frame(rbind(circos_links_unique, circos_links_mixed))

# Sector order: ligands first, then receptors in the reorganised order.
# The trailing sort() puts ligands in alphabetical order, overriding the ordering by celltype1.
target_order = circos_links_reorg$receptor %>% unique()
ligand_order = circos_links_reorg[order(circos_links_reorg$celltype1),]$ligand %>% unique() %>% sort()
order = c(ligand_order,target_order)
#Figure 7D-7E
# Reset circlize state, then draw the directional chord diagram with two pre-allocated outer tracks.
circos.clear()
chordDiagram(links_circle, directional = 1,order=order,link.sort = TRUE, link.decreasing = FALSE, grid.col = grid_col,transparency = transparency, diffHeight = 0.005, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow", link.visible = links_circle$meanz,annotationTrack = "grid", 
             preAllocateTracks = list(list(track.height = 0.075), list(track.height = 0.2)))
# Track 2 carries the gene name of every sector, written radially.
circos.track(track.index = 2, panel.fun = function(x, y) {
  circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
              facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 0.7)
}, bg.border = NA) #
# Track 1: one red band over all ligand sectors, then one band per target cell-type group over its receptors.
highlight.sector(circos_links_reorg$ligand, track.index = 1, col = "red", font = 2, facing = 'bending.outside', text.vjust = '0.5mm',
                 text = "Ligands produced by ciliated epithelial cells", cex = 0.6, text.col = "white", niceFacing = TRUE)
# NOTE(review): rand_color() is asked for as many colours as there are groups on every call, although each
# call highlights one group only -- confirm whether a single colour per call was intended.
lapply(unique(circos_links_reorg$targetcelltype), 
       function(k) highlight.sector(subset(circos_links_reorg, targetcelltype==k)$receptor, track.index = 1, col = rand_color(length(unique(circos_links_reorg$targetcelltype))), font = 2, facing = 'bending.inside', text.vjust = '0.5mm',
                                    text = k, cex = 0.6, text.col = "white", niceFacing = TRUE))

###############ciliated_high (condition 4, Figure 7E)#############
# Keep interactions whose sending cell type (celltype1) matches 'Ciliated'.
ciliated_high_miner_ciliated <- ciliated_high_miner[grep('Ciliated', ciliated_high_miner$celltype1),]
# Two sequential filters: rows above the 95th percentile of z1, then rows above the 95th
# percentile of z2 computed on the rows that survived the first filter.
ciliated_high_miner_ciliated <- subset(ciliated_high_miner_ciliated, z1>quantile(ciliated_high_miner_ciliated$z1, 0.95))
ciliated_high_miner_ciliated <- subset(ciliated_high_miner_ciliated, z2>quantile(ciliated_high_miner_ciliated$z2, 0.95))

circos.clear()
# Columns are picked by position (1-4 and 7); the code below relies on them providing
# ligand, receptor, celltype1, celltype2 and meanz -- TODO confirm against the miner table layout.
circos_links <- ciliated_high_miner_ciliated[,c(1:4, 7)]
# One random colour per distinct ligand and per distinct receptor, named by gene.
grid_col_ligand = rand_color(length(unique(ciliated_high_miner_ciliated$ligand)))
names(grid_col_ligand) <- unique(ciliated_high_miner_ciliated$ligand)
grid_col_target = rand_color(length(unique(ciliated_high_miner_ciliated$receptor)))
names(grid_col_target) <- unique(ciliated_high_miner_ciliated$receptor)

# Colour lookup tables used for the joins below.
grid_col_tbl_ligand = tibble(ligand = grid_col_ligand %>% names(), color_ligand_type = grid_col_ligand)
grid_col_tbl_target = tibble(receptor = grid_col_target %>% names(), color_target_type = grid_col_target)

#circos_links = circos_links %>% mutate(ligand = paste(ligand," ")) # extra space: make a difference between a gene as ligand and a gene as target!
circos_links = circos_links %>% inner_join(grid_col_tbl_ligand, by = 'ligand') %>% inner_join(grid_col_tbl_target, by = 'receptor')
# Three-column link table (from, to, value) passed to chordDiagram().
links_circle = circos_links %>% select(ligand,receptor, meanz)

# Collapse the joined colours back into named vectors (gene -> colour) for grid.col.
ligand_color = circos_links %>% distinct(ligand,color_ligand_type)
grid_ligand_color = ligand_color$color_ligand_type %>% set_names(ligand_color$ligand)
target_color = circos_links %>% distinct(receptor,color_target_type)
grid_target_color = target_color$color_target_type %>% set_names(target_color$receptor)

grid_col =c(grid_ligand_color,grid_target_color)

# give the option that links in the circos plot will be transparent ~ ligand-target potential score
# meanz is min-max scaled to [0, 1]; transparency is 1 - scaled value, so the link with the
# highest meanz gets transparency 0 and the one with the lowest gets 1.
transparency = circos_links %>% mutate(meanz =(meanz-min(meanz))/(max(meanz)-min(meanz))) %>% mutate(transparency = 1-meanz) %>% .$transparency

# Map each receiving cell type (celltype2) to a coarser label. The mapping is positional against
# the alphabetically sorted distinct celltype2 values, so the hand-written `to` vector is tied to
# the cell types present in this particular filtered table.
# NOTE(review): presumably `to` must have exactly one entry per distinct celltype2 -- verify if the filters change.
circos_links$condensedcelltype <- plyr::mapvalues(circos_links$celltype2, from = sort(unique(circos_links$celltype2)),
                                                  to = c('Ciliated', 'Club', 'DC', 'Epithelial', 'T', rep('Endothelial', 3)))
# Count table of condensed cell type x receptor; row and column names are re-assigned from the
# sorted distinct values after the conversion to a plain matrix.
receptoroccurence <- as.matrix.data.frame(table(circos_links$condensedcelltype, circos_links$receptor))
rownames(receptoroccurence) <- sort(unique(circos_links$condensedcelltype))
colnames(receptoroccurence) <- sort(unique(circos_links$receptor))
receptoroccurence <- as.data.frame(receptoroccurence)
# For each receptor: in how many condensed cell types it occurs at least once.
receptoroccurence_total <- apply(receptoroccurence, 2, function(k) sum(k>0))
# A receptor seen in exactly one condensed cell type keeps that label; otherwise it is tagged 'mixed'.
condensedcelltype_links <- vector()
for (i in 1:nrow(circos_links)){
  if (receptoroccurence_total[circos_links$receptor[i]]==1){
    condensedcelltype_links[i] <- circos_links$condensedcelltype[i]
  }else{
    condensedcelltype_links[i] <- 'mixed'
  }
}
circos_links$targetcelltype <- condensedcelltype_links
# Reorder links: cell-type-specific receptors first (sorted by target cell type), 'mixed' ones last.
circos_links_mixed <- subset(circos_links, circos_links$targetcelltype=='mixed')
circos_links_unique <- subset(circos_links, circos_links$targetcelltype!='mixed')
circos_links_unique <- circos_links_unique[order(circos_links_unique$targetcelltype),]
circos_links_reorg <- as.data.frame(rbind(circos_links_unique, circos_links_mixed))

# Sector order: ligands first, then receptors in the reorganised order.
# The trailing sort() puts ligands in alphabetical order, overriding the ordering by celltype1.
target_order = circos_links_reorg$receptor %>% unique()
ligand_order = circos_links_reorg[order(circos_links_reorg$celltype1),]$ligand %>% unique() %>% sort()
order = c(ligand_order,target_order)

#chord diagram
#process below repeated for all four conditions above, producing 4 different diagrams
#Figure 7B-7C
# NOTE(review): when the script is run top to bottom, links_circle / circos_links_reorg at this point hold the
# condition 4 (ciliated ligand) data, so this template call draws them under the "macrophages" label;
# the 7D-7E call further down then starts a new plot from the same objects.
circos.clear()
chordDiagram(links_circle, directional = 1,order=order,link.sort = TRUE, link.decreasing = FALSE, grid.col = grid_col,transparency = transparency, diffHeight = 0.005, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow", link.visible = links_circle$meanz,annotationTrack = "grid", 
             preAllocateTracks = list(list(track.height = 0.075), list(track.height = 0.2)))
# Track 2 carries the gene name of every sector, written radially.
circos.track(track.index = 2, panel.fun = function(x, y) {
  circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
              facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 0.7)
}, bg.border = NA) #
# Track 1: one red band over all ligand sectors, then one band per target cell-type group over its receptors.
highlight.sector(circos_links_reorg$ligand, track.index = 1, col = "red", font = 2, facing = 'bending.outside', text.vjust = '0.5mm',
                 text = "Ligands produced by macrophages", cex = 0.6, text.col = "white", niceFacing = TRUE)
lapply(unique(circos_links_reorg$targetcelltype), 
       function(k) highlight.sector(subset(circos_links_reorg, targetcelltype==k)$receptor, track.index = 1, col = rand_color(length(unique(circos_links_reorg$targetcelltype))), font = 2, facing = 'bending.inside', text.vjust = '0.5mm',
                                    text = k, cex = 0.6, text.col = "white", niceFacing = TRUE))

#Figure 7D-7E
# Same drawing sequence with the ciliated-epithelium label on the ligand band.
circos.clear()
chordDiagram(links_circle, directional = 1,order=order,link.sort = TRUE, link.decreasing = FALSE, grid.col = grid_col,transparency = transparency, diffHeight = 0.005, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow", link.visible = links_circle$meanz,annotationTrack = "grid", 
             preAllocateTracks = list(list(track.height = 0.075), list(track.height = 0.2)))
circos.track(track.index = 2, panel.fun = function(x, y) {
  circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
              facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 0.7)
}, bg.border = NA) #
highlight.sector(circos_links_reorg$ligand, track.index = 1, col = "red", font = 2, facing = 'bending.outside', text.vjust = '0.5mm',
                 text = "Ligands produced by ciliated epithelial cells", cex = 0.6, text.col = "white", niceFacing = TRUE)
lapply(unique(circos_links_reorg$targetcelltype), 
       function(k) highlight.sector(subset(circos_links_reorg, targetcelltype==k)$receptor, track.index = 1, col = rand_color(length(unique(circos_links_reorg$targetcelltype))), font = 2, facing = 'bending.inside', text.vjust = '0.5mm',
                                    text = k, cex = 0.6, text.col = "white", niceFacing = TRUE))

#nichenet output
library(nichenetr)

# Every annotated cluster of gse135893_ipf is used both as a receiver and as a sender.
# The list is defined once so the two arguments cannot drift apart.
nichenet_celltypes <- c(
  'SPP1pos_macs_0', 'Ciliated_1', 'C1QA_mac_2', 'Ciliated_3', 'AT1_4', 'AT2_5', 'C1QA_mac_6',
  'ACKR1pos_endo_7', 'Monocytes_8', 'AT1_9', 'Th_10', 'AT1_11', 'Macs_12', 'Tc_13', 'HAS1_fibro_14',
  'Diff_ciliated_15', 'ACKR1neg_endo_16', 'Fibroblasts_17', 'Prolif_macs_18', 'Lymph_endo_19',
  'Sm_20', 'Bcells_21', 'Macs_22', 'PC_23', 'AT2_24', 'MC_25', 'AT1_26', 'Macs_27', 'Ciliated_28',
  'Fibroblast_29'
)

# NicheNet ligand activity analysis contrasting Ciliated_high (condition of interest)
# against Ciliated_low (reference) on the `ciliated` metadata column; ligands are not
# pre-filtered to the top-ranked ones (filter_top_ligands = FALSE).
nichenet_output <- nichenet_seuratobj_aggregate(
  seurat_obj = gse135893_ipf,
  receiver = nichenet_celltypes,
  condition_colname = "ciliated", condition_oi = "Ciliated_high", condition_reference = "Ciliated_low",
  sender = nichenet_celltypes,
  ligand_target_matrix = ligand_target_matrix, lr_network = lr_network, weighted_networks = weighted_networks, organism = "human",
  filter_top_ligands = FALSE)
#Supplementary Figure 9:
# Expression of the first 30 ligands returned in top_ligands across clusters (order reversed for plotting).
DotPlot(gse135893_ipf, features = nichenet_output$top_ligands[1:30] %>% rev(), cols = "RdYlBu") + RotatedAxis()
save(nichenet_output, file = 'gse135893_nichenet_ipfonly_ciliated_seuratnorm.RData')
######################Figure 8.##########################
library(ISLR)
library(caret)
library(MLeval)
options(stringsAsFactors = FALSE)

#celltype-based classifiers
# Join the consensus cluster labels with the per-sample cell-type GSVA scores (samples become rows).
gse47460_celltypegsva_consensus <- merge(consensusdf_gse47460, t(gse47460_celltypegsva_table), by.x = 0, by.y = 0)
rownames(gse47460_celltypegsva_consensus) <- gse47460_celltypegsva_consensus$Row.names
gse47460_celltypegsva_consensus$Row.names <- NULL
gse47460_celltypegsva_consensus$Geo <- NULL
# Class labels must be valid R names ('cluster_<k>') because classProbs = TRUE is used below.
gse47460_celltypegsva_consensus$consensusclass <- paste0('cluster_', gse47460_celltypegsva_consensus$consensusclass)
gse47460_celltypegsva_consensus$consensusclass <- as.factor(gse47460_celltypegsva_consensus$consensusclass)

# Stratified 70/30 train/test split.
set.seed(1)
indxTrain <- createDataPartition(y = gse47460_celltypegsva_consensus$consensusclass, p = 0.7, list = FALSE)
training <- gse47460_celltypegsva_consensus[indxTrain, ]
testing <- gse47460_celltypegsva_consensus[-indxTrain, ]

# Repeated cross-validation summarised with twoClassSummary (ROC, Sens, Spec).
# metric = "ROC" is passed explicitly: with the default metric ("Accuracy") caret warns
# that the metric is not in the result set and falls back to ROC anyway.
set.seed(400)
ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE)

# Linear SVM. (A class-label predict() call that was overwritten straight away has been dropped.)
svmFit <- train(consensusclass ~ ., data = training, method = "svmLinear", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
library(pROC)
svmPredict <- predict(svmFit, newdata = testing, type = "prob")
svmROC <- roc(as.factor(testing$consensusclass), svmPredict[, "cluster_1"])
svmimportance <- varImp(svmFit)

# Gradient boosting machine.
library(gbm)
gbmFit <- train(consensusclass ~ ., data = training, method = "gbm", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
gbmPredict <- predict(gbmFit, newdata = testing, type = "prob")
gbmROC <- roc(as.factor(testing$consensusclass), gbmPredict[, "cluster_1"])
gbmimportance <- varImp(gbmFit, scale = FALSE)

# Elastic-net regularised GLM.
glmnetFit <- train(consensusclass ~ ., data = training, method = "glmnet", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
glmnetPredict <- predict(glmnetFit, newdata = testing, type = "prob")
glmnetROC <- roc(as.factor(testing$consensusclass), glmnetPredict[, "cluster_1"])
glmnetimportance <- varImp(glmnetFit, scale = FALSE)
#Figure 8B:
print(glmnetimportance)

library(mlbench)
# Compare the three models on their saved cross-validation predictions.
res <- evalm(list(svmFit, gbmFit, glmnetFit), gnames = c('svm', 'gbm', 'glmnet'))

#gene-based classifiers
# Join the consensus cluster labels with the phenotype + expression table, keyed on the GEO sample id.
gse47460_phenom_consensus_ipf <- as.data.frame(merge(consensusdf_gse47460, gse47460_phenom, by.x = 'Geo', by.y = 0))
gse47460_phenom_consensus_ipf$Geo <- NULL
gse47460_phenom_consensus_ipf$consensusclass <- paste0('cluster_', gse47460_phenom_consensus_ipf$consensusclass)
gse47460_phenom_consensus_ipf$consensusclass <- as.factor(gse47460_phenom_consensus_ipf$consensusclass)
# Keep the class label (column 1) and columns 15-19010, selected by position.
# NOTE(review): presumably columns 2-14 are phenotype fields and 15-19010 the genes -- confirm.
gse47460_phenom_consensus_ipf <- gse47460_phenom_consensus_ipf[, c(1, 15:19010)]
# Restrict to the 10,000 genes with the highest median absolute deviation.
mads_gse47460 <- apply(gse47460_phenom_consensus_ipf[, 2:18997], 2, mad)
gse474760_mostvariable <- names(sort(mads_gse47460, decreasing = TRUE)[1:10000])
gse47460_phenom_consensus_ipf <- gse47460_phenom_consensus_ipf[, c('consensusclass', gse474760_mostvariable)]

# Stratified 70/30 split of the GENE table. The earlier version partitioned and trained on
# gse47460_celltypegsva_consensus here, which merely refitted the cell-type models.
set.seed(1)
indxTrain <- createDataPartition(y = gse47460_phenom_consensus_ipf$consensusclass, p = 0.7, list = FALSE)
training <- gse47460_phenom_consensus_ipf[indxTrain, ]
testing <- gse47460_phenom_consensus_ipf[-indxTrain, ]

# Repeated cross-validation summarised with twoClassSummary; metric = "ROC" is passed explicitly
# because with the default metric caret warns and falls back to ROC anyway.
set.seed(400)
ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE)

# Linear SVM. (A class-label predict() call that was overwritten straight away has been dropped.)
svmFit <- train(consensusclass ~ ., data = training, method = "svmLinear", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
library(pROC)
svmPredict <- predict(svmFit, newdata = testing, type = "prob")
svmROC <- roc(as.factor(testing$consensusclass), svmPredict[, "cluster_1"])
svmimportance <- varImp(svmFit)

# Gradient boosting machine.
library(gbm)
gbmFit <- train(consensusclass ~ ., data = training, method = "gbm", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
gbmPredict <- predict(gbmFit, newdata = testing, type = "prob")
gbmROC <- roc(as.factor(testing$consensusclass), gbmPredict[, "cluster_1"])
gbmimportance <- varImp(gbmFit, scale = FALSE)

# Elastic-net regularised GLM.
glmnetFit <- train(consensusclass ~ ., data = training, method = "glmnet", metric = "ROC", trControl = ctrl, preProcess = c("center", "scale"))
glmnetPredict <- predict(glmnetFit, newdata = testing, type = "prob")
glmnetROC <- roc(as.factor(testing$consensusclass), glmnetPredict[, "cluster_1"])
glmnetimportance <- varImp(glmnetFit, scale = FALSE)
#Figure 8C:
res <- evalm(list(svmFit, gbmFit, glmnetFit), gnames = c('svm', 'gbm', 'glmnet'))

#Recursive feature elimination
# 5-fold cross-validated RFE over the gene predictors, evaluating subset sizes 1 to 50.
control <- rfeControl(functions = caretFuncs, method = "cv", number = 5)
# Predictors and class labels are both taken from gse47460_phenom_consensus_ipf so that rows are
# guaranteed to line up; the labels were previously read from a separately merged table
# (gse47460_celltypegsva_consensus), whose row order is not tied to this one.
rfe_is_gene <- colnames(gse47460_phenom_consensus_ipf) != "consensusclass"
results_rfe <- rfe(gse47460_phenom_consensus_ipf[, rfe_is_gene],
                   gse47460_phenom_consensus_ipf$consensusclass,
                   sizes = 1:50, rfeControl = control)
print(results_rfe)
predictors(results_rfe)
plot(results_rfe, type = c("g", "o"))

#Figure 8D: same code below repeated for each gene, example of FOXJ1 shown
# Expression of the gene per consensus cluster: boxplot with the individual samples jittered on top.
ggplot(gse47460_phenom_consensus, aes(consensusclass, FOXJ1, fill = consensusclass)) + 
  geom_boxplot(width=0.5) + geom_jitter(color="black", shape=16, position=position_jitter(0.2)) + theme_bw() + 
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9"))

# Dunn's post-hoc test between clusters for the plotted gene.
# The formula previously referred to 'FOX11', a typo for FOXJ1.
dunnTest(FOXJ1~consensusclass, data = gse47460_phenom_consensus)
#end

# Pairwise correlations between all columns after the first (class label) column,
# shown first as a numeric upper-triangle corrplot with hierarchical ordering.
M <- gse47460_celltypegsva_consensus[, 2:ncol(gse47460_celltypegsva_consensus)] %>%
  as.matrix() %>%
  cor()
corrplot(M, order = 'hclust', type = 'upper', method = 'number')
# Matching p-value matrix; ggcorrplot blanks out the cells flagged as not significant.
p.mat <- gse47460_celltypegsva_consensus[, 2:ncol(gse47460_celltypegsva_consensus)] %>%
  as.matrix() %>%
  cor_pmat()
ggcorrplot(M, p.mat = p.mat, insig = 'blank', lab = TRUE, hc.order = TRUE, outline.col = "white")

###############Figure 9.########################
# Score each GSE47460 sample for the pirfenidone up-regulated gene set with GSVA,
# attach the consensus cluster labels, then compare the score between clusters.
pirfenidone_up <- read.csv(
  'pirfenidone_up_signature_Kwapiszewska_2018_paper_p005_lung_homogenates_lfc141.csv',
  header = TRUE
)
gse47460_pirfenidone_up <- gsva(
  as.matrix(gse47460_matrix),
  gset.idx.list = list(pirfenidone_up$Gene)
)

# Samples become rows; both tables are joined on their row names.
gse47460_pirfenidone_up_consensus <- merge(
  t(gse47460_pirfenidone_up),
  consensusdf2_gse47460,
  by.x = 0, by.y = 0
)
rownames(gse47460_pirfenidone_up_consensus) <- gse47460_pirfenidone_up_consensus$Row.names
gse47460_pirfenidone_up_consensus$Row.names <- NULL
# After dropping Row.names the first column holds the single GSVA score.
colnames(gse47460_pirfenidone_up_consensus)[1] <- 'Pirfenidone_signature_score'

# Boxplot per consensus cluster with the individual samples jittered on top.
ggplot(
  gse47460_pirfenidone_up_consensus,
  aes(consensusclass, Pirfenidone_signature_score, fill = consensusclass)
) +
  geom_boxplot(width = 0.5, show.legend = FALSE) +
  geom_jitter(color = "black", shape = 16, position = position_jitter(0.2)) +
  theme_bw() +
  scale_fill_manual(values = nicecolors) +
  ylab('Signature score') +
  ggtitle('Pirfenidone signature score') +
  theme(plot.title = element_text(size = 12), legend.position = "none")
# Dunn's post-hoc test of the signature score between clusters.
dunnTest(Pirfenidone_signature_score ~ consensusclass, data = gse47460_pirfenidone_up_consensus)

#end