-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.sh
executable file
·381 lines (348 loc) · 17 KB
/
script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
#!/bin/bash
# This script takes nanopore FastQ data and does several types of variant calling.
# Variant calling is done against a downloaded (ncbi) reference, and denovo assembled references.
# A sample-mapping table corresponds the appropriate wild type sample with its treatments
# All software is supposed to be in dedicated conda environments.
# First, this script takes the raw data and does flye assemblies for all samples.
# When assembled, a snapshot of the assembly graph is generated with Bandage.
# Second, it polishes these flye assemblies with medaka.
# The original sample and corresponding WT reads are mapped to the polished assembly for later visualisation.
# Third, it annotates these pollished assemblies with both prokka and bakta for comparison
# Finally, this script does two forms of variant calling to all relevant references.
# The relevant references are defined in a mapping table, corresponding samples to their WT.
# Variants are called with medaka again, which includes minimap2 readmapping.
# Structural variants are called with Sniffles.
# To this end, nanopore reads are mapped with a dedicated aligner: ngmlr.
# For visualisation of denovo assemlbes in IGV, I recommend loading a polished genome and the appropriate gff files from bakta and prokka.
# BAM files for assembly verification are available next to the consensus fasta file.
# For visualisation of variant calling on any reference, I recommend loading the appropriate reference with its GFF files.
# Next, load the VCF files generated by medaka (one file per sample) and sniffles (one file per reference)
# Read mappings are available as minimap2 mappings by medaka and as ngmlr mappings as well. You can load both or just one of the two.
# define where stuff is:
basedir=/stor/anabaena # this is where we create our output
fqdir=/stor/azolla_sequencing/nanopore/anabaena # a dir with nanopore .fastq.gz files to process
baktaDB=/stor/scripts/baktaDB # bakta db for annotation
condadir=/home/laura/miniconda3 # (mini)conda(3) directory
# define references
ncbi="$basedir"/reference/GCF_000009705.1_ASM970v1_genomic.fna # a reference genome to call variants on
WT="$basedir"/denovo/WT/polished-medaka/consensus.fasta # the name(s) of your wild type sample to use as a reference for variant calling
CSV15="$basedir"/denovo/CSV15/polished-medaka/consensus.fasta
CSAM="$basedir"/denovo/CSAM/polished-medaka/consensus.fasta
refs=( "$WT" "$CSV15" "$ncbi" "$CSAM" )
ref_names=( 'WT' 'CSV15' 'ncbi' 'CSAM')
maptab="$basedir"/WT_sample.txt
CPU=$(nproc)
# Coloured text
BLD=$(tput bold)
GRN=$(tput setaf 2)
RED=$(tput setaf 1)
BLU=$(tput setaf 4)
NML=$(tput sgr0)
# check mapping table presence
if [ ! -f "$maptab" ]
then echo -e "$BLD""$RED""Wildtype to sample mapping table is not found, make sure to set this propperly. $NML"
echo -e "expecting mapping table at path $maptab"
exit 1
fi
# functions to use in the script
function checkprog {
if [ ! $(command -v "$1") ]
then echo -e "$BLD""$RED""Command $1 is not found $NML"
echo 'Make sure the propper conda environments are installed'
exit 1
fi
}
function checkwd {
if [ ! -d "$wd" ]
then mkdir "$wd"
fi
}
# A trick to swtich conda environments while using this script, adapt to your particular conda installation.
if [ ! -f "$condadir"/etc/profile.d/conda.sh ]
then echo -e "$BLD""$RED""quiting for we need the conda environments in the `envs` directory to proceed $NML"
exit
else source "$condadir"/etc/profile.d/conda.sh
fi
# get an array of our samples directly from the available sequencing files and check if any samples are found
samples=( $(find "$fqdir" -maxdepth 1 -name '*.fastq.gz' -printf '%P\n') )
if [ "${#samples[@]}" -lt 1 ]
then echo -e "$BLD""$RED""no samples found $NML"
exit
fi
# for each sample, make a de novo assembly with flye
echo -e "$BLD""$GRN""Checking if all denovo assemblies are present $NML"
conda activate flye
wd="$basedir"/denovo
checkwd
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -d "$wd/$name" ]
then checkprog flye
checkprog Bandage
# assemble with flye expecting a genome of 6.4Mb
flye --nano-hq "$fqdir/$s" \
--genome-size 6.4M \
--threads "$CPU" \
--scaffold \
--out-dir "$wd/$name"
Bandage image "$wd/$name"/assembly_graph.gfa \
"$wd/$name".png
fi
done
conda deactivate
# for each sample, polish the assembly with medaka
echo -e "$BLD""$GRN""Checking if all denovo assemblies are polished $NML"
conda activate medaka
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -d "$basedir"/denovo/"$name/polished-medaka" ]
then checkprog medaka_consensus
medaka_consensus -i "$fqdir/$s" \
-d "$basedir"/denovo/"$name"/assembly.fasta \
-o "$basedir"/denovo/"$name/polished-medaka" \
-m r941_min_sup_variant_g507 \
-t 6
fi
done
conda deactivate
# for each sample, map the sample reads to the polished assembly
echo -e "$BLD""$GRN""Checking if all denovo assemblies have sample reads mapped back $NML"
conda activate nanopore
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -f "$basedir"/denovo/"$name/polished-medaka/$name.bam" ]
then checkprog minimap2
checkprog samtools
# check if a minimap2 index is already present:
if [ ! -f "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta.mmi ]
then minimap2 "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta \
-d "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta.mmi
fi
# run minimap2, then sort the samfile and convert to bam
minimap2 "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta \
"$fqdir/$s" \
-x map-ont \
-t "$CPU" \
-Y \
-a \
| samtools sort -@ 6 -l 9 -m 9G \
> "$basedir"/denovo/"$name/polished-medaka/$name.bam"
# index the bamfile for igv
samtools index "$basedir"/denovo/"$name/polished-medaka/$name.bam"
fi
done
conda deactivate
# for each sample, map the WT reads to the polished assembly
echo -e "$BLD""$GRN""Checking if all denovo assemblies have WT reads mapped back $NML"
conda activate nanopore
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
refname=$(grep '\W'"$name" "$maptab" | grep -v ncbi | cut -f 1 )
if [ ! -f "$basedir"/denovo/"$name/polished-medaka/$refname.bam" ]
then checkprog minimap2
checkprog samtools
# check if a minimap2 index is already present:
if [ ! -f "$basedir"/denovo/"$name"/assembly.fasta.mmi ]
then minimap2 "$basedir"/denovo/"$name"/assembly.fasta \
-d "$basedir"/denovo/"$name"/assembly.fasta.mmi
fi
# run minimap2, then sort the samfile and convert to bam
minimap2 "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta \
"$fqdir/$refname".fastq.gz \
-x map-ont \
-t 6 \
-Y \
-a \
| samtools sort -@ 6 -l 9 -m 9G \
> "$basedir"/denovo/"$name/polished-medaka/$refname.bam"
# index the bamfile for igv
samtools index "$basedir"/denovo/"$name/polished-medaka/$refname.bam"
fi
done
conda deactivate
# Annotate all assembled and polished genomes with prokka
echo -e "$BLD""$GRN""Checking if all polished assemblies are annotated with prokka $NML"
conda activate prokka
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
wd="$basedir"/denovo/"$name"/polished-medaka_prokka-annotation
if [ ! -d "$wd" ]
then checkprog prokka
prokka --outdir "$wd" \
--addgenes \
--genus 'nostoc' \
--prefix "$name" \
--kingdom 'Bacteria' \
--cpus 0 \
--rfam \
"$basedir"/denovo/"$name/polished-medaka/consensus.fasta"
fi
done
conda deactivate
# Annotate all assembled and polished genomes with bakta
conda activate bakta
if [ ! -d "$baktaDB"/amrfinderplus-db ]
then echo -e "$BLD""$RED""amrfinderplus-db is not setup correctly, doing that now $NML"
checkprog amrfinder_update
amrfinder_update --database "$baktaDB"/amrfinderplus-db
fi
echo -e "$BLD""$GRN""Checking if all polished assemblies are annotated with bakta $nml"
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
wd="$basedir"/denovo/"$name"/polished-medaka_bakta-annotation
if [ ! -d "$wd" ]
then checkprog bakta
bakta --output "$wd" \
--db "$baktaDB" \
--genus 'nostoc' \
--prefix "$name" \
--compliant \
--complete \
--keep-contig-headers \
--threads "$CPU" \
"$basedir"/denovo/"$name/polished-medaka/consensus.fasta"
fi
done
conda deactivate
# call variants with medaka on all selected reference sequences
conda activate medaka
for r in $(seq 1 1 "${#refs[@]}" )
do count=$(echo "$r -1" | bc) # correct for 0based counting
refname="${ref_names[$count]}" # define refname
echo -e "$BLD""$GRN""Checking if all samples are used for variant calling on reference $refname $NML"
wd="$basedir"/haplotypes_"$refname"/medaka
checkwd
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -d "$wd/$name" ]
then checkprog medaka_haploid_variant
# check if this reference-sample combo should be ran,
# otherwise continue to the next combo
if [ $(grep \^"$refname" "$maptab" | grep "$name" -c ) -eq 1 ]
then medaka_haploid_variant -i "$fqdir/$s" \
-r "${refs[$count]}" \
-o "$wd/$name" \
-t "$CPU" \
-m r941_min_sup_variant_g507
fi
fi
done
done
conda deactivate
# map reads with ngmlr for variant calling with sniffles
conda activate nanopore
for r in $(seq 1 1 "${#refs[@]}" )
do count=$(echo "$r -1" | bc) # correct for 0based counting
refname="${ref_names[$count]}" # define refname
echo -e "$BLD""$GRN""Checking if all samples are mapped to $refname with ngmlr $NML"
wd="$basedir"/haplotypes_"$refname"/mapped_ngmlr
checkwd
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -f "$wd/$name".sorted.bam ]
then checkprog ngmlr
checkprog samtools
# check if this reference-sample combo should be ran,
# otherwise continue to the next combo
if [ $(grep \^"$refname" "$maptab" | grep "$name" -c ) -eq 1 ]
then ngmlr -q "$fqdir/$s" \
-r "${refs[$count]}" \
--rg-sm "$name" \
-t "$CPU" \
-x ont \
-o "$wd/$name".sam
# now process the bam file with samtools for later use and visualisation
samtools sort -@ 6 -m 9G "$wd/$name".sam \
> "$wd/$name".sorted.bam
samtools index "$wd/$name".sorted.bam
rm "$wd/$name".sam
fi
fi
done
done
#call variants with sniffles on the reference of of choice
for r in $(seq 1 1 "${#refs[@]}" )
do count=$(echo "$r -1" | bc) # correct for 0based counting
refname="${ref_names[$count]}" # define refname
echo -e "$BLD""$GRN""Checking if all samples are used for structural variant calling on reference $refname $NML"
wd="$basedir"/haplotypes_"$refname"/sniffles
checkwd
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -f "$wd/$name".vcf ]
then checkprog sniffles
# check if this reference-sample combo should be ran,
# otherwise continue to the next combo
if [ $(grep \^"$refname" "$maptab" | grep "$name" -c ) -eq 1 ]
then sniffles --input "$wd"/../mapped_ngmlr/"$name".sorted.bam \
--reference "${refs[$count]}" \
--sample-id "$name" \
--snf "$wd/$name".snf \
--vcf "$wd/$name".vcf
fi
fi
done
# now combine all sniffles calls in one vcf
if [ ! -f "$wd"/"$refname"_multi-sample.vcf ]
then sniffles --input "$wd/"*.snf --vcf "$wd"/"$refname"_multi-sample.vcf
# arbitrary fix to remove whole chromosome duplications
grep -Pv '\tSniffles2\.DUP' "$wd"/"$refname"_multi-sample.vcf > ./temp.txt
mv temp.txt "$wd"/"$refname"_multi-sample.vcf
# create a fasta file with the sequences of all insertions
while read line
do name=$( echo $line | cut -d ' ' -f 1,2,3 | sed 's/\W/_/g')
seq=$( echo $line | cut -f 5 -d ' ')
nonseq=$(echo $seq | tr -d A | tr -d C | tr -d T | tr -d G | sed 's/\w//g' | wc -c )
# check if seq is a seq, then print
if [ "$nonseq" -eq 1 ]
then echo \>"$name"
echo "$seq"
fi
unset name seq nonseq
done < <(grep -v '#' "$wd"/"$refname"_multi-sample.vcf \
| grep PASS \
| grep '\.INS\.' \
) \
> "$wd"/"$refname"_multi-sample_insertions.fasta
# create a fasta file with the sequences of all deletions
bedtools getfasta \
-fi "${refs[$count]}" \
-fo "$wd"/"$refname"_multi-sample_deletions.fasta \
-bed <( \
grep -v '#' "$wd"/"$refname"_multi-sample.vcf \
| grep PASS \
| grep '\.DEL\.' \
| vcf2bed \
| cut -f 1,2,9 \
| sed 's/\t[PI].*;END=/\t/g' \
| cut -d ';' -f 1 \
)
fi
echo -e "$BLD""$BLU""In reference $refname the following variants were annotated by sniffles2 $NML"
echo -e "$BLU"'Insertions:' $(grep -v '#' "$wd"/"$refname"_multi-sample.vcf | grep PASS | grep '\.INS\.' | wc -l )
echo -e "$BLU"'Deletions:' $(grep -v '#' "$wd"/"$refname"_multi-sample.vcf | grep PASS | grep '\.DEL\.' | wc -l )
echo -e "$BLU"'Recombinations:' $(grep -v '#' "$wd"/"$refname"_multi-sample.vcf | grep PASS | grep '\.BND\.' | wc -l )
done
conda deactivate
conda activate igv
# searching for sequences of interest in all denovo assemblies
echo -e "$BLD""$GRN""hecking blat searches marking sequences of interest in the de-novo assemblies $NML"
wd="$basedir"/igv_configs/
checkwd
for s in "${samples[@]}"
do name=$(echo "$s" | sed 's/\.fastq\.gz//g' )
if [ ! -f "$wd"/"$name"_blathits.psl ]
then checkprog blat
blat "$basedir"/denovo/"$name"/polished-medaka/consensus.fasta \
reference/queries.fasta \
-t=DNA -q=DNA \
"$wd"/"$name"_blathits.psl
fi
done
conda deactivate
# export R markdown page with some statistics on the flye assemblies
if [ ! -f ./denovo/flyestats.html ]
then Rscript -e "rmarkdown::render('flyestats.Rmd',output_file='./denovo/flyestats.html')"
else echo -e "$GRN""not updating denovo/flyestats html because it already exists. $NML"
fi
echo -e "$BLD""$GRN"'Script finished'"$NML"