-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06_relatedness.Rmd
153 lines (116 loc) · 5.68 KB
/
06_relatedness.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
---
title: "06_relatedness"
author: "John Dou"
date: "December 13, 2020"
output: html_document
---
```{batch}
module load Bioinformatics
module load plink/1.9
module load R
cd /nfs/turbo/bakulski1/People/johndou/ALS_Goutman/06_Relate/
```
## relatedness check
```{batch}
# Check for relationships between individuals with a pihat > 0.2.
plink --bfile ../05_Heterozygosity/als09_PAR_het --extract ../05_Heterozygosity/indepSNP.prune.in --genome --min 0.2 --out pihat_min0.2
# nobody with pihat > 0.2
plink --bfile ../05_Heterozygosity/als09_PAR_het --extract ../05_Heterozygosity/indepSNP.prune.in --genome --out all_relation
```
FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
62 618 75 1003 UN NA 0.0000 0.0000 1.0000 1.0000 -1 0.999992 1.0000 NA
93 81 508 _0081 UN NA 0.0000 0.0000 1.0000 1.0000 -1 1.000000 1.0000 NA
215 9970 215 _9970 OT 0 0.0000 0.0000 1.0000 1.0000 -1 0.999996 1.0000 NA
216 86 507 _0086 UN NA 0.0000 0.0000 1.0000 1.0000 -1 0.999996 1.0000 NA
234 8288 234 _8288 OT 0 0.0000 0.0000 1.0000 1.0000 -1 1.000000 1.0000 NA
339 233 432 670 UN NA 0.2227 0.5121 0.2652 0.5213 -1 0.865215 1.0000 11.8355
495 779 496 701 UN NA 0.0001 0.0000 0.9999 0.9999 -1 0.999985 1.0000 NA
```{r}
relatedness = read.table("all_relation.genome", header=T)
pdf("relatedness.pdf")
par(pch=16, cex=1)
with(relatedness,plot(Z0,Z1, xlim=c(0,1), ylim=c(0,1), type="n", xlab="Z0 (proportion of loci where pair shares zero alleles IBD)", ylab="Z1 (proportion of loci where pair shares one allele IBD)"))
with(subset(relatedness,RT=="PO") , points(Z0,Z1,col=4))
with(subset(relatedness,RT=="UN") , points(Z0,Z1,col='black'))
legend(1,1, xjust=1, yjust=1, legend=levels(relatedness$RT), pch=16, col='black')
dev.off()
mycol <- c(rgb(20, 20, 235, max = 255, alpha = 125))
pdf("hist_relatedness.pdf")
h <- hist(relatedness[,10], breaks=500, plot=FALSE)
par(mar=c(5.1,6.3,4.1,2.1))
plot(h, main="Relatedness", xlab= "Pihat", ylab="", axes=FALSE, xlim=c(0,0.25), col=mycol, cex.lab=1.5)
axis(1, cex.axis=1.5)
axis(2, las=2, cex.axis=1.5)
mtext("Frequency", side=2, line=5, cex=1.5)
abline(v=0.2, col="red")
dev.off()
rep <- relatedness[relatedness$Z0<0.1 & relatedness$Z1<0.1,]
unexp_rel <- relatedness[relatedness$Z1>0.4,]
write.csv(rep, file='suspected_replicates.csv', row.names=F, quote=F)
write.csv(unexp_rel, file='unexpected_relation.csv', row.names=F,)
```
Z0 and Z1 near 0:
FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
26664 62 618 75 1003 UN NA 0e+00 0 1.0000 1.0000 -1 0.999992 1 NA
39512 93 81 508 _0081 UN NA 0e+00 0 1.0000 1.0000 -1 1.000000 1 NA
79412 215 9970 215 _9970 OT 0 0e+00 0 1.0000 1.0000 -1 0.999996 1 NA
79690 216 86 507 _0086 UN NA 0e+00 0 1.0000 1.0000 -1 0.999996 1 NA
84598 234 8288 234 _8288 OT 0 0e+00 0 1.0000 1.0000 -1 1.000000 1 NA
119164 495 779 496 701 UN NA 1e-04 0 0.9999 0.9999 -1 0.999985 1 NA
High Z1:
FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
339 233 432 670 UN NA 0.2227 0.5121 0.2652 0.5213 -1 0.865215 1 11.8355
## for actual replicates and for cryptic relatadness, drop one with lower call rate, drop both non-replicate sample matches
```{batch}
plink --bfile ../05_Heterozygosity/als09_PAR_het --missing
```
```{r}
relatedness = read.table("all_relation.genome", header=T)
rep <- relatedness[relatedness$Z0<0.1 & relatedness$Z1<0.1,]
unexp_rel <- relatedness[relatedness$Z1>0.4,]
rep$IID1 <- as.character(rep$IID1)
rep$IID2 <- as.character(rep$IID2)
unexp_rel$IID1 <- as.character(unexp_rel$IID1)
unexp_rel$IID2 <- as.character(unexp_rel$IID2)
samp_miss <- read.table(file="plink.imiss", header=TRUE)
samp_miss <- samp_miss[samp_miss$IID %in% c(rep$IID1, rep$IID2, unexp_rel$IID1, unexp_rel$IID2), ]
actual_reps <- c(2:5)
rep_actual <- rep[actual_reps,]
rep_mistake <- rep[-actual_reps,]
#for inappropriate matches
remove_these <- data.frame(FID=c(rep_mistake$FID1, rep_mistake$FID2),
IID=c(rep_mistake$IID1, rep_mistake$IID2))
remove_these$FID <- as.character(remove_these$FID)
remove_these$IID <- as.character(remove_these$IID)
remove_these <- remove_these[remove_these$IID != 1003, ]
#for actual replicates, drop one with more missing
for(i in 1:nrow(rep_actual)){
s1 = c(FID=rep_actual[i,'FID1'], IID=rep_actual[i,'IID1'])
s2 = c(FID=rep_actual[i,'FID2'], IID=rep_actual[i,'IID2'])
s1.miss = samp_miss[samp_miss$IID==s1['IID'], 'F_MISS']
s2.miss = samp_miss[samp_miss$IID==s2['IID'], 'F_MISS']
if(s1.miss > s2.miss){
remove_these <- rbind(remove_these, s1)
}else{
remove_these <- rbind(remove_these, s2)
}
}
#loop unecessary for this since only one pair, but concept the same
for(i in 1:nrow(unexp_rel)){
s1 = c(FID=unexp_rel[i,'FID1'], IID=unexp_rel[i,'IID1'])
s2 = c(FID=unexp_rel[i,'FID2'], IID=unexp_rel[i,'IID2'])
s1.miss = samp_miss[samp_miss$IID==s1['IID'], 'F_MISS']
s2.miss = samp_miss[samp_miss$IID==s2['IID'], 'F_MISS']
if(s1.miss > s2.miss){
remove_these <- rbind(remove_these, s1)
}else{
remove_these <- rbind(remove_these, s2)
}
}
write.table(remove_these, file='relatedness_filter.txt', quote=F, row.names=F)
```
```{batch}
plink --bfile ../05_Heterozygosity/als09_PAR_het --remove relatedness_filter.txt --make-bed --out als10_PAR_rel
plink --bfile ../05_Heterozygosity/als09_nPAR_female_het --remove relatedness_filter.txt --make-bed --out als10_nPAR_female_rel
plink --bfile ../05_Heterozygosity/als09_nPAR_male_het --remove relatedness_filter.txt --make-bed --out als10_nPAR_male_rel
```