-
Notifications
You must be signed in to change notification settings - Fork 8
/
exploreSubstitutions.R
137 lines (110 loc) · 5.81 KB
/
exploreSubstitutions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
##############################################################################
### Use this script to identify substitutions between trip, teo, and maize ###
### from the SNPs called by angsd. ###
##############################################################################
### Timothy M. Beissinger
### 8-15-2014
### Set working directory
getwd()
setwd("../SNPs")
### Load SNP calls
rawSNPs <- read.table("fooSNPs.txt",header=F,stringsAsFactors=F,sep="\t",na.strings="NN")
SNPs <- rawSNPs[,1:41] #remove extra column
### Load and format header
head <- read.table("../INS/snpCallList.txt",stringsAsFactors=F) #load
head <- unlist(strsplit(head[,1],split="mergedBams/"))[seq(2,74,2)] #seperate directory
head <- unlist(strsplit(head,split="_merged.bam")) #remove tail
head <- c("chr","pos","Major","Minor",head) #add early columns before taxa
### Add header
names(SNPs) <- head
### Function to identify substitutions
whichAreSubs <- function(data,outCols=5,popCols=6:41,missing.limit=0.5){
if(length(which(is.na(data[popCols]))) > (missing.limit*length(popCols)) ) return ("population NA Issue")
if(length(which(is.na(data[outCols]))) > (missing.limit*length(outCols)) ) return ("outgroup NA Issue")
out <- unique(unlist(strsplit(as.character(data[outCols]),split="")))
out <- out[is.na(out)==F]
pop <- unique(unlist(strsplit(as.character(data[popCols]),split="")))
pop <- pop[is.na(pop)==F]
if(length(out)>0 & length(pop)>0 & length(intersect(out,pop))==0) return ("Sub")
}
### Index by millions
length <- nrow(SNPs) # count number of SNPs
floor <- floor(length/1000000) # find lowest million
starts <-c(1, c(1:floor)*1000000 +1)
stops <- c(starts[1:{length(starts)-1}]+999999,length)
# Trip vs. Maize & Teo
TvMT_result <- c()
for(c in 1:length(starts)){
print(paste(starts[c],":",stops[c],sep=""))
SNPsub <- SNPs[starts[c]:stops[c],]
TvMT_result[starts[c]:stops[c]] <- apply(SNPsub,1,whichAreSubs,outCols=5,popCols=6:41,missing.limit=0.2) # trip vs mays
}
#TvMT_result<- apply(SNPs,1,whichAreSubs,outCols=5,popCols=6:41,missing.limit=0.2) # trip vs mays
TvMT <- SNPs[which(as.character(TvMT_result)=="Sub"),c(1:4,5,6)]
write.table(TvMT,file="TvMT.txt",row.names=F,col.names=T,quote=F,sep="\t")
# Trip vs. Maize
TvM_result <- c()
for(c in 1:length(starts)){
print(paste(starts[c],":",stops[c],sep=""))
SNPsub <- SNPs[starts[c]:stops[c],]
TvM_result[starts[c]:stops[c]] <- apply(SNPsub,1,whichAreSubs,outCols=5,popCols=19:41,missing.limit=0.2) # trip vs maize
}
#TvM_result<- apply(SNPs,1,whichAreSubs,outCols=5,popCols=19:41,missing.limit=0.2) # trip vs maize
TvM <- SNPs[which(as.character(TvM_result)=="Sub"),c(1:4,5,19)]
write.table(TvM,file="TvM.txt",row.names=F,col.names=T,quote=F,sep="\t")
# Trip vs. Teo
TvT_result <- c()
for(c in 1:length(starts)){
print(paste(starts[c],":",stops[c],sep=""))
SNPsub <- SNPs[starts[c]:stops[c],]
TvT_result[starts[c]:stops[c]] <- apply(SNPsub,1,whichAreSubs,outCols=5,popCols=6:18,missing.limit=0.2) # trip vs teo
}
#TvT_result<- apply(SNPs,1,whichAreSubs,outCols=5,popCols=6:18,missing.limit=0.2) # trip vs teo
TvT <- SNPs[which(as.character(TvT_result)=="Sub"),c(1:4,5,6)]
write.table(TvT,file="TvT.txt",row.names=F,col.names=T,quote=F,sep="\t")
# Teo vs. Maize
TevM_result <- c()
for(c in 1:length(starts)){
print(paste(starts[c],":",stops[c],sep=""))
SNPsub <- SNPs[starts[c]:stops[c],]
TevM_result[starts[c]:stops[c]] <- apply(SNPsub,1,whichAreSubs,outCols=6:18,popCols=19:41,missing.limit=0.2) # teo vs maize
}
#TevM_result <- apply(SNPs,1,whichAreSubs,outCols=6:18,popCols=19:41,missing.limit=0.2) # teo vs maize
TevM <- SNPs[which(as.character(TevM_result)=="Sub"),c(1:4,6,19)]
write.table(TevM,file="TevM.txt",row.names=F,col.names=T,quote=F,sep="\t")
# Print some summary stats
cat(c("Trip-Maize/Teo Subs = ",nrow(TvMT), "\n"))
cat(c("Trip-Maize Subs = ",nrow(TvM), "\n"))
cat(c("Trip-Teo Subs = ",nrow(TvT), "\n"))
cat(c("Teo-Maize Subs = ",nrow(TevM), "\n"))
# Save image
save.image("SubstitutionSpace.RData")
### Identify investigated SNPs in TvMT
TvMT_tested <- SNPs$pos[which(as.character(TvMT_result) == "NULL" | as.character(TvMT_result) =="Sub" )]
### Determine expected distribution of distance between substitutions
uni_subs <- sample(TvMT_tested,nrow(TvMT))
uni_subs <- uni_subs[order(uni_subs)]
uni_gap <- uni_subs-c(1,uni_subs[1:{length(uni_subs)-1}])
### Determine observed distribution of distance between substitutions
gap <- TvMT$pos-c(1,TvMT$pos[1:{length(TvMT$pos)-1}])
### Density plots--observed vs expected
pdf("Observed_vs_expected.pdf")
par(mfrow=c(2,1))
hist(TvMT$pos,breaks=100,xlab="Substitution positions",main="Observed distribution of substitutions at the start of chromosome 1")
hist(uni_subs,breaks=100,xlab="Substitution positions",main="Expected distribution of substitutions at the start of chromosome 1")
par(mfrow=c(1,1))
plot(density(gap),col="red",lwd=3,main="Density of substitution spacing",xlab="Space between subs",xlim=c(0,50000),ylim=c(0,0.003))
lines(density(uni_gap),col="blue",lwd=3)
legend("topright","(x,y)",c("Observed distance between substitutions","Expected distribution between substitutions"),lwd=3,col=c("red","blue"))
dev.off()
### Load large TvMT
TvMT <- read.table("TvMT.txt",header=T,stringsAsFactors=F)
TvMT_1 <- TvMT[which(TvMT$chr==1),]
hist(TvMT_1$pos,breaks=10000)
gap <- TvMT_1$pos-c(1,TvMT_1$pos[1:{length(TvMT_1$pos)-1}])
uni_samp <- sample(max(TvMT_1$pos),nrow(TvMT_1))[1:100]
uni_samp <- uni_samp[order(uni_samp)]
uni_gap <- uni_samp-c(1,uni_samp[1:{length(uni_samp)-1}])
plot(density(uni_gap,bw=500000),col="blue",lwd=3,xlim=c(0,1.5e7))
lines(density(gap,bw=500000),col="red",lwd=3,main="Density of substitution spacing",xlim=c(0,1.5e7))
legend("topright","(x,y)",c("Observed distance between substitutions","Expected distribution between substitutions"),lwd=3,col=c("red","blue"))