-
Notifications
You must be signed in to change notification settings - Fork 1
/
heatmap-GSI-SD.R
127 lines (112 loc) · 4.54 KB
/
heatmap-GSI-SD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
########################################################################################
### Title: Heatmap based on GSI and high variable MHL
########################################################################################
library("gplots")
library("RColorBrewer")
library("grDevices")
library("impute")
args = commandArgs(trailingOnly=TRUE)
RawNARemove<-function(data,missratio=0.3){
threshold<-(missratio)*dim(data)[2]
NaRaw<-which(apply(data,1,function(x) sum(is.na(x))>threshold))
zero<-which(apply(data,1,function(x) all(x==0))==T)
NaRAW<-c(NaRaw,zero)
if(length(NaRAW)>0){
dat<-data[-NaRAW,]
}else{
dat<-data;
}
dat
}
gsi<-function(data){
group=names(table(colnames(data)))
index=colnames(data)
gsi<-c()
gmaxgroup<-c()
for(i in 1:nrow(data)){
gsit<-0
gmax<-names(which.max(tapply(as.numeric(data[i,]),index,mean)))
for(j in 1:length(group)){
tmp<-(1-10^(mean(data[i,][which(index==group[j])]))/10^(mean(data[i,][which(index==gmax)])))/(length(group)-1)
gsit<-gsit+tmp
}
gmaxgroup<-c(gmaxgroup,gmax)
gsi<-c(gsi,gsit)
print(c(gmax,gsit))
}
rlt=data.frame(region=rownames(data),group=gmaxgroup,GSI=gsi)
return(rlt)
}
heatmap<-function(mydata){
hclustfunc <- function(x) hclust(x, method="complete")
distfunc <- function(x) dist(x, method="euclidean")
# perform clustering on rows and columns
cl.row <- hclustfunc(distfunc(mydata))
cl.col <- hclustfunc(distfunc(t(mydata)))
# extract cluster assignments; i.e. k=8 (rows) k=5 (columns)
gr.row <- cutree(cl.row, 6)
# gr.col <- cutree(cl.col, 5)
# require(RColorBrewer)
col1 <- brewer.pal(6, "Set1") # the maximum of brewer.pal is 12
#col2 <- brewer.pal(5, "Pastel1")
tol21rainbow= c("#771155", "#AA4488", "#CC99BB", "#114477", "#4477AA", "#77AADD", "#117777", "#44AAAA", "#77CCCC", "#117744", "#44AA77", "#88CCAA", "#777711", "#AAAA44", "#DDDD77", "#774411", "#AA7744", "#DDAA77", "#771122", "#AA4455", "#DD7788")
col2<-tol21rainbow[as.numeric(as.factor(cl.col$labels))]
col=colorRampPalette(c("blue", "yellow"))(20)
heatmaprlt<-heatmap.2(as.matrix(mydata),hclustfun=hclustfunc, distfun=distfunc,
RowSideColors=col1[gr.row],
ColSideColors=col2,
labRow=F,
trace="none",
col=col,
density.info="none")
legend=unique(data.frame(col2,cl.col$labels))
legend(x=0.85,y=0.8,legend=legend[,2],col=as.character(legend[,1]),pch=15,cex = 0.5)
dev.off()
return(heatmaprlt)
}
################################################################################################
#########################################Data Input ############################################
################################################################################################
infile="WGBS_methHap_load_matrix_20Oct2015.txt";
file1<-read.table(infile,head=T,sep="\t",row.names=1,as.is=T,check.names=F)
infile="Heatmap.MHL.txt"
file2<-read.table(infile,head=T,sep="\t",row.names=1,as.is=T,check.names=F)
dim(file1)
dim(file2)
file1<-file1[match(rownames(file2),rownames(file1)),]
file1<-cbind(file1,file2)
# miss value detection and imputation
library("impute")
library("preprocessCore")
library("sva")
f2<-RawNARemove(file1,missratio=0.20)
f2<-impute.knn(data.matrix(f2))$data
colnames(f2)<-gsub("_","-",colnames(f2))
colname2<-unlist(lapply(colnames(f2),function(x) unlist(strsplit(x,"[.]"))[1]))
colnames(f2)<-colname2
# be sure all the sample information has been stored in the following database
saminfo2<-read.table("newsaminfo.txt",head=T,sep="\t",as.is=T)
colnames(f2)<-saminfo2[match(colname2,saminfo2[,1]),2]
colnames(f2)
mydata=f2
# feature selection plan A
GSI<-gsi(mydata)
GSI<-GSI[order(GSI[,3],decreasing=T),]
mydata1<-mydata[match(names(sort(apply(mydata,1,sd),decreasing=T))[1:5000],rownames(mydata)),]
colnames(mydata1)<-colname2
#pdf("heatmap.wgbs.mhl.top5000gsi.pdf")
#rlt1<-heatmap(mydata1)
#dev.off()
hc1 <- hclust( dist(t(mydata1), method = "euclidean") , method="complete")
# feature selection plan B
x<-apply(mydata,1,function(x) sd(x))
y<-order(x,decreasing=T)[1:args[1]]
mydata2<-mydata[match(names(x[y]),rownames(mydata)),]
colnames(mydata2)<-colname2
#file=paste("heatmap.wgbs.mhl.top-sd-",args[1],".pdf",sep="")
#pdf(file)
#rlt2<-heatmap(mydata2)
hc2 <- hclust( dist(t(mydata2), method = "euclidean") , method="complete")
# compare the two cluster result(correlation)
cor(cophenetic(hc1), cophenetic(hc2)) # 0.8836503 (correlation of cophenetic dissimilarity matrix)
save.image("Figure3a.heatmap.RData")