-
Notifications
You must be signed in to change notification settings - Fork 0
/
trycycler_assemble_read_subsets.sh
230 lines (201 loc) · 8.51 KB
/
trycycler_assemble_read_subsets.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/bin/bash
# Run Flye, Raven, and Minipolish assembler to assemble subsets of Nanopore reads generated using command "trycycler subsample".
# Copyright (C) 2023 Yu Wan <wanyuac@126.com>
# Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
# First version: 27 July 2023; last update: 17 December 2023
# Help information ###############
display_usage() {
echo "
This script iteratively runs Raven, Flye, and Miniasm-minipolish to assemble subsets (<100) of Nanopore reads
generated using command 'trycycler subset'. Output assemblies (in subdirectory '1_assemblies' in the current
working directory) can be used for Trycycler command 'trycycler cluster'.
Preprequsites: please ensure flye, raven, minimap2, miniasm, minipolish, and any2fasta are accessible in
your environment. These programs can be installed with Conda.
Parameters:
-d=*: directory of input FASTQ files (*.fastq, mandatory, and do not include the final forward slash)
-p=*: number of assembly-polishing rounds (default: 2; optional)
-t=*: number of threads (default: 1; optional)
-l=*: expected genome length (e.g., 5m) for Flye (mandatory)
-h: a flag to instruct Flye to treat input reads as of high quality (error rate <5%; optional)
-k=*: length of minimisers used to find overlaps by Raven (default: 15; optional)
-w=*: length of sliding window from which minimisers are sampled by Raven (default: 5; optional)
-s=*: start sample index (default: 1; optional); useful when adding more fastq files into a previous run
Example useage:
/usr/local/bin/Assembly_toolkit/trycycler_assemble_read_subsets.sh -d=\"\$HOME/reads/subsets\" -p=2 -t=8 -l=2.5m -h -k=15 -w=5
/usr/local/bin/Assembly_toolkit/trycycler_assemble_read_subsets.sh -d=\"\$HOME/reads/subsets_extra\" -p=2 -t=8 -l=2.5m -h -k=15 -w=5 -s=13
Output: directory '1_assemblies', which stores assemblies in GFA and FASTA formats, will be created under the current working directory.
Note: when parameter s > 1, the directory of input reads must contain corresponding FASTQ files. For example, when s = 16, the directory
must have sample_16.fastq, etc. Otherwise, assemblers will return an error of missing inputs.
"
}
output_dir='1_assemblies'
# Assembler runners ###############
run_flye() {
local i="$1" # Index of read subset: 01, 02, ..., 12, etc
local d="$2" # Directory of input reads
local g="$3" # Probable genome size (for instance, 5m)
local p="$4" # Number of polishing rounds
local t="$5" # Number of threads
local h="$6" # Accuracy level of Nanopore reads: raw (expected error rate: <15%) or hq (<5%)
local r="$d/sample_${i}.fastq" # Input FASTQ file
local pre="${output_dir}/assembly_${i}"
# Generate a genome assembly from long reads ###############
tmp_dir=$(mktemp -d -t flye-XXXXXXXXXX) # Create a temporary directory
if [ "$h" == true ]
then
echo "[$(date)] Assembling high-quality (error rate <5%) ONT reads $r using Flye (polish: ${p}) with $t threads (temporary output directory: $tmp_dir)"
flye --nano-hq "$r" --threads "$t" --out-dir "$tmp_dir" --iterations "$p" --genome-size "$g"
else
echo "[$(date)] Assembling regular (error rate <15%) ONT reads $r using Flye v$v (polish: ${p}) with $t threads (temporary output directory: $tmp_dir)"
flye --nano-raw "$r" --threads "$t" --out-dir "$tmp_dir" --iterations "$p" --genome-size "$g"
fi
mv $tmp_dir/assembly.fasta ${pre}.fasta
mv $tmp_dir/assembly_graph.gfa ${pre}.gfa
mv $tmp_dir/assembly_info.txt ${pre}.txt
mv $tmp_dir/flye.log ${pre}.log
rm -rf $tmp_dir
}
run_raven() {
local i="$1" # Index of read subset: 01, 02, ..., 12, etc
local d="$2" # Directory of input reads
local k="$3" # K-mer size
local w="$4" # Window length
local p="$5" # Polish rounds
local t="$6" # Number of threads
local r="$d/sample_${i}.fastq"
local pre="${output_dir}/assembly_${i}"
echo "[$(date)] Assembling long reads from FASTQ file $r using Raven (k=${k}, w=${w}; polish=${p}; threads=${t})"
raven --threads "$t" --polishing-rounds "$p" --kmer-len "$k" --window-len "$w" --disable-checkpoints --graphical-fragment-assembly "${pre}.gfa" "$r" > "${pre}.fasta"
}
run_miniasm_and_minipolish() { # Code in the function is adapted from https://github.com/rrwick/Minipolish/blob/main/miniasm_and_minipolish.sh.
# It takes two positional arguments:
# 1) a long read file
# 2) the number of threads to use
local i="$1" # Index of read subset: 01, 02, ..., 12, etc
local d="$2" # Directory of input reads
local t="$3" # Number of threads
local r="$d/sample_${i}.fastq" # Input FASTQ file
local pre="${output_dir}/assembly_${i}" # Prefix of output files
# Create temporary intermediate files.
overlaps=$(mktemp)".paf"
unpolished_assembly=$(mktemp)".gfa"
# Assemble ONT reads and polish
echo "[$(date)] Assembling reads from $r using miniasm_and_minipolish ($t threads)"
minimap2 -x ava-ont -t "$t" "$r" "$r" > "$overlaps" # Find read overlaps with minimap2
miniasm -f "$r" "$overlaps" > "$unpolished_assembly" # Run miniasm to make an unpolished assembly
minipolish --threads "$t" "$r" "$unpolished_assembly" > "${pre}.gfa" # Polish the assembly with minipolish, outputting the result to stdout.
any2fasta "${pre}.gfa" > "${pre}.fasta" # Convert the GFA file to a FASTA file
rm "$overlaps" "$unpolished_assembly" # Clean up
}
# Main ###############
if [ -z $1 ]
then
display_usage
exit
fi
# Read arguments ===============
index_start=1 # Default value
for i in "$@"
do
case $i in
-d=*)
dir_in="${i#*=}" # Directory of input read files sample_01.fastq, sample_02.fastq, ..., sample_xy.fastq
;;
-p=*)
polish="${i#*=}" # Number of polishing rounds
;;
-t=*)
threads="${i#*=}" # Number of threads
;;
-l=*)
genome_len="${i#*=}" # Probable genome length (bp) for Flye
;;
-h)
high_accuracy_reads=true # High-accuracy ONT reads (error rate <5%) for Flye
;;
-k=*)
raven_kmer="${i#*=}" # K-mer length for Raven (default: 15)
;;
-w=*)
raven_window_len="${i#*=}" # Window length for Raven (default: 5)
;;
-s=*)
index_start="${i#*=}" # Update this value with the user-specified start index of FASTQ filenames
;;
*) # Do nothing otherwise.
;;
esac
done
# Initialisation ===============
if [ ! -d "$dir_in" ]
then
echo "[$(date)] Error: directory of input FASTQ files was not found."
exit
fi
if [ -z "genome_len" ]
then
echo "[$(date)] Error: expected genome length was not specified for Raven."
exit
fi
if [ -z "$high_accuracy_reads" ]
then
echo "[$(date)] Flye: treats input reads as raw (error rate <15%)."
high_accuracy_reads=false
else
echo "[$(date)] Flye: treats input reads as high-quality (error rate <5%)"
fi
if [ -z "$polish" ]
then
echo "[$(date)] Set the number of assembly-polishing rounds to 2."
polish=2
fi
if [ -z "$threads" ]
then
echo "[$(date)] Thread number was not specified, so set the number to 1."
threads=1
fi
if [ -z "$raven_kmer" ]
then
echo "[$(date)] Set k-mer length to 15 bp for Raven."
raven_kmer=15
fi
if [ -z "$raven_window_len" ]
then
echo "[$(date)] Set window size to 5 for Raven."
raven_window_len=5
fi
if [ ! -d "$output_dir" ]
then
echo "[$(date)] Output directory $output_dir was not found, so create it in the current working directory."
mkdir "$output_dir"
fi
# Assemble subsets of ONT reads ===============
n=$(find $dir_in -name '*.fastq' -type f | wc -l)
if [ $n -gt 0 ]
then
echo "[$(date)] Found $n input FASTQ files"
j=$index_start
m=$((j+n)) # The maximum index + 1
echo "FASTQ filenames start from index $j"
else
echo "[$(date)] Error: no FASTQ file (.fastq) was found in input directory $dir_in".
exit
fi
while [ $j -lt $m ]
do
k=$(printf "%02d" $j) # Add leading zeros to the index when j < 10
run_flye "$k" "$dir_in" "$genome_len" "$polish" "$threads" "$high_accuracy_reads"
((j++))
if [ $j -lt $m ]
then
k=$(printf "%02d" "$j")
run_raven "$k" "$dir_in" "$raven_kmer" "$raven_window_len" "$polish" "$threads"
((j++))
if [ $j -lt $m ]
then
k=$(printf "%02d" "$j")
run_miniasm_and_minipolish "$k" "$dir_in" "$threads"
((j++))
fi
fi
done