-
Notifications
You must be signed in to change notification settings - Fork 1
/
GATK_pipeline_v2_part2.sh
executable file
·268 lines (201 loc) · 8.97 KB
/
GATK_pipeline_v2_part2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/bin/bash
#
# all submission arguments here
########################################################################################################
# PART 2: From raw basecalls to GATK-ready reads
########################################################################################################
## Recalibration
# - GATK BQSR (Base Quality Score Recalibration) using BaseRecalibrator
# - GATK PrintReads
# - GATK BQSR (Base Quality Score Recalibration) using BaseRecalibrator
# - GATK AnalyzeCovariates
## Variant Calling
# - GATK HaplotypeCaller
# - bgzip + tabix .g.vcf
# - GATK ValidateVariants
## File cleanup
# - file cleanup
########################################################################################################
# execute using:
# bsub -J "GATKp2[1-20]" < scripts/GATK_pipeline_v2_part2.sh
source ./utils.sh
mkdir -p $root_folder/logs
####################################################
# Variables dependent on sample
####################################################
sample=$(sed "${LSB_JOBINDEX}q;d" bams_to_process.txt)
samplelog=$root_folder/logs/$sample.log
count=$(grep $sample exclude.txt | wc -l)
if [ "$count" -ge "1" ]; then
exit
fi
########################################################################################################
# GATK BQSR (Base Quality Score Recalibration)
# STEP 1: Creating covariates table masking known indel, SNP sites
########################################################################################################
if [ ! -f $root_folder/logs/part_2_GATK_BQSRpre_finished_$sample.txt ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting GATK BaseRecalibrator Pre---" >> "$samplelog"
time (java -Xmx12g -Djava.io.tmpdir=/tmp \
-jar $path_GATK \
-T BaseRecalibrator \
-R $path_ref \
-nct 16 \
-I $root_folder/input_bams/$sample.reheader.bam \
-knownSites $bundle2_8/b37/dbsnp_138.b37.vcf \
-knownSites $bundle2_8/b37/Mills_and_1000G_gold_standard.indels.b37.vcf \
-knownSites $bundle2_8/b37/1000G_phase1.indels.b37.vcf \
-o $root_folder/BQSR/$sample.recal_table)
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'), Finished GATK BaseRecalibrator Pre" >> "$samplelog" \
&& touch $root_folder/logs/part_2_GATK_BQSRpre_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'), ERROR: GATK BaseRecalibrator Pre not completed. ExitValue = $exitValue" >> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping GATK BaseRecalibrator Pre since it was already computed***" >> "$samplelog"
fi
########################################################################################################
# GATK BQSR (Base Quality Score Recalibration)
# STEP 2: Writing the new BAM with recalibrated Q scores
########################################################################################################
if [ ! -f $root_folder/logs/part_2_GATK_PrintReads_finished_$sample.txt ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting GATK PrintReads---" >> "$samplelog"
time (java -Xmx12g -Djava.io.tmpdir=/tmp \
-jar $path_GATK \
-T PrintReads \
-R $path_ref \
-nct 16 \
-I $root_folder/input_bams/$sample.reheader.bam \
-BQSR $root_folder/BQSR/$sample.recal_table \
-o $root_folder/input_bams/$sample.recal.bam)
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---GATK PrintReads finished---" >> "$samplelog" \
&& touch $root_folder/logs/part_2_GATK_PrintReads_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'), ERROR: GATK PrintReads not completed. ExitValue = $exitValue" >> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping GATK PrintReads since it was already computed***" >> "$samplelog"
fi
####################################################
# GATK BQSR post
####################################################
if [ ! -f $root_folder/logs/part_2_GATK_BQSRpost_finished_$sample.txt ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting GATK BaseRecalibrator Post---" >> "$samplelog"
# Count covariates in recal.BAM to compare before/after recalibration
time (java -Xmx12g -Djava.io.tmpdir=/tmp \
-jar $path_GATK \
-T BaseRecalibrator \
-R $path_ref \
-nct 16 \
-I $root_folder/input_bams/$sample.reheader.bam \
-knownSites $bundle2_8/b37/dbsnp_138.b37.vcf \
-knownSites $bundle2_8/b37/Mills_and_1000G_gold_standard.indels.b37.vcf \
-knownSites $bundle2_8/b37/1000G_phase1.indels.b37.vcf \
-BQSR $root_folder/BQSR/$sample.recal_table \
-o $root_folder/BQSR/$sample.recal_table_post)
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'), Finished GATK BaseRecalibrator Post" >> "$samplelog" \
&& touch $root_folder/logs/part_2_GATK_BQSRpost_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'), ERROR: GATK BaseRecalibrator Post not completed. ExitValue = $exitValue" >> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping GATK BaseRecalibrator Post since it was already computed***" >> "$samplelog"
fi
####################################################
# GATK AnalyzeCovariates
####################################################
if [ ! -f $root_folder/logs/part_2_GATK_AnalyzeCovariates_finished_$sample.txt ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting GATK AnalyzeCovariates---" >> "$samplelog"
# Create a pdf plot to see the results of recalibration
time (java -Xmx15g -Djava.io.tmpdir=/tmp \
-jar $path_GATK \
-T AnalyzeCovariates \
-R $path_ref \
-before $root_folder/BQSR/$sample.recal_table \
-after $root_folder/BQSR/$sample.recal_table_post \
-plots $root_folder/BQSR/$sample.recal_plots.pdf \
-csv $root_folder/BQSR/$sample.recal_plots.csv \
-l DEBUG \
)
# Catching failed java jobs ("time" doesn't change the exit value of java above)
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---GATK AnalyzeCovariates finished---" >> "$samplelog" \
&& touch $root_folder/logs/part_2_GATK_AnalyzeCovariates_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'), ERROR: GATK AnalyzeCovariates not completed. ExitValue = $exitValue" >> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping GATK AnalyzeCovariates since it was already computed***" >> "$samplelog"
fi
####################################################
# Genotyping - HaplotypeCaller
####################################################
if [ ! -f $root_folder/logs/part_2_HaplotypeCaller_finished_$sample.txt ]; then
echo "Starting HaplotypeCaller"
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting HaplotypeCaller---" >> "$samplelog"
time (java -Xmx12g -Djava.io.tmpdir=/tmp \
-Djava.library.path=$root_folder \
-jar $path_GATK \
-T HaplotypeCaller \
-R $path_ref \
-I $root_folder/input_bams/$sample.recal.bam \
--emitRefConfidence GVCF \
--dbsnp $bundle2_8/b37/dbsnp_138.b37.vcf \
-o $path_rds_vcfs/$sample.g.vcf.gz \
-pairHMM VECTOR_LOGLESS_CACHING )
# -nct 16 \
# Catching failed java jobs ("time" doesn't change the exit value of java above)
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Finished HaplotypeCaller---" >> "$samplelog" \
&& touch $root_folder/logs/part_2_HaplotypeCaller_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),---HaplotypeCaller failed---" >> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping HaplotypeCaller since it was previously computed***" >> "$samplelog"
fi
####################################################
# Check the validity of the g.vcf.gz file
####################################################
if [ ! -f $root_folder/logs/part_2_GVCF_validation_finished_$sample.txt ]; then
echo "Checking GVCF validity"
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Starting GVCF validation after HaplotypeCaller---" >> "$samplelog"
time (java -Xmx12g -Djava.io.tmpdir=/tmp \
-jar $path_GATK \
-T ValidateVariants \
-R $path_ref \
-V $path_rds_vcfs/$sample.g.vcf.gz \
--validationTypeToExclude ALL \
) >> "$samplelog"
# --validateGVCF \
# NOTE that in case of invalid VCF, GATK will exit anyway
exitValue=$?
if [ $exitValue == 0 ]; then
echo "$(date '+%d/%m/%y_%H:%M:%S'),---GVCF validation after HaplotypeCaller COMPLETED---">> "$samplelog" \
&& touch $root_folder/logs/part_2_GVCF_validation_finished_$sample.txt
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),---GVCF validation after HaplotypeCaller NOT COMPLETED---">> "$samplelog"
exit $exitValue
fi
else
echo "$(date '+%d/%m/%y_%H:%M:%S'),***Skipping gVCF.gz validation since it was previously computed***" >> "$samplelog"
fi
####################################################
# File cleanup
####################################################
echo "$(date '+%d/%m/%y_%H:%M:%S'),---Removing last BAMs (reheader and recal)---" >> "$samplelog"
rm -f $root_folder/input_bams/$sample.reheader.*
rm -f $root_folder/input_bams/$sample.recal.*
echo "$(date '+%d/%m/%y_%H:%M:%S'), Finished removing last BAMs (reheader and recal)" >> "$samplelog"