-
Notifications
You must be signed in to change notification settings - Fork 0
/
alignments_multi.sh
executable file
·383 lines (304 loc) · 14.6 KB
/
alignments_multi.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/bin/bash
usage="usage: ./alignments_multi.sh <targets FASTA> <genome FASTA> <output directory> [<intronic regions BED>]"
mafft_timeout="15s"
max_introns=5000 # skip intron highlighting if exceeded
if [ $# -lt 3 ]
then
echo "Error in $0: wrong number of command arguments"
echo "$usage"
exit 1
fi
targets=$1
genome=$2
outdir=$3
intronic=$4
source utils/progress.sh
rm -rf "$outdir"/{genomic_sequences,query_sequences,alignments,query_coverage,temp.fasta,temp.bed,introns.tmp,summary.txt,mafft.log}
mkdir -p "$outdir"/{genomic_sequences,query_sequences,alignments,query_coverage}
shopt -s nullglob
iter=1
i_max=$(ls -1q "${outdir}/genomic_ranges" | wc -l)
for f in "${outdir}/genomic_ranges"/*
do
# extract/modify sequences
# 1) genome ("+" strand)
########################
id=$(echo "$f" | sed 's/.*\/\([^/]\+\)\.gff3$/\1/')
bedtools getfasta \
-fi "$genome" \
-bed "$f" \
-fo "${outdir}/genomic_sequences/${id}.fasta"
# switch to lower case
fasta_formatter -i "${outdir}/genomic_sequences/${id}.fasta" \
| awk '{ if ($0 !~ />/) {print tolower($0)} else {print $0} }' \
> "${outdir}/temp.fasta"
mv "${outdir}/temp.fasta" "${outdir}/genomic_sequences/${id}.fasta"
# find and highlight intronic regions
if [ $# -eq 4 ]
then
gff2bed < "$f" | cut -f1-6 > "${outdir}/temp.bed"
bedtools intersect \
-a "${outdir}/temp.bed" \
-b "$intronic" \
-wo \
| awk 'BEGIN{OFS="\t"}{print $4,$5,$6,$7,$8,$9}' \
> "${outdir}/introns.tmp"
n_introns=$(wc -l < "${outdir}/introns.tmp")
range_start=$(cut -f2 "${outdir}/temp.bed")
for k in $(seq 1 "$n_introns")
do
read -r intron_start intron_end <<<"$(sed -n "${k}p" "${outdir}/introns.tmp" | cut -f5,6)"
# intron positions relative to pair
a=$(( intron_start - range_start ))
b=$(( intron_end - range_start ))
cat "${outdir}/genomic_sequences/${id}.fasta" | utils/upint.py $a $b > "${outdir}/temp.fasta"
mv "${outdir}/temp.fasta" "${outdir}/genomic_sequences/${id}.fasta"
done
fi
# 2) query (possibly reverse-complemented)
##########################################
bedtools getfasta \
-s \
-fi "$targets" \
-bed "${outdir}/query_intervals/${id}.gff3" \
-fo "${outdir}/query_sequences/${id}.fasta"
fasta_formatter -i "${outdir}/query_sequences/${id}.fasta" \
| awk '{ if ($0 !~ /^>/) {print tolower($0)} else {print $0} }' > "${outdir}/temp.fasta"
mv "${outdir}/temp.fasta" "${outdir}/query_sequences/${id}.fasta"
echo -e "target ID: $id\n" >> "${outdir}/mafft.log"
# align genomic and query sequences
###################################
timeout "${mafft_timeout}" mafft \
--inputorder \
--preservecase \
--auto \
--addfragments "${outdir}/query_sequences/${id}.fasta" \
"${outdir}/genomic_sequences/${id}.fasta" \
> "${outdir}/alignments/${id}.fasta" 2>> "${outdir}/mafft.log"
# in case of timeout, remove alignment
if [ $? -eq 124 ]; then
rm "${outdir}/alignments/${id}.fasta" 2> /dev/null
fi
# Sometimes mafft --addframgents fails if the reference contains too much ambiguous nucleotides.
# --maxambiguous 1.0 could help, but is only available in MAFFT's online version.
# As a workaround, missing alignments are recomputed without the --addfragments option:
if ! test -s "${outdir}/alignments/${id}.fasta"; then
echo -e "\nWarning: mafft --addfragments failed, alignment repeated without the --addfragments option\n" >> "${outdir}/mafft.log"
cat "${outdir}/genomic_sequences/${id}.fasta" "${outdir}/query_sequences/${id}.fasta" > "${outdir}/mafft_input.tmp"
timeout "${mafft_timeout}" mafft \
--inputorder \
--preservecase \
--auto \
"${outdir}/mafft_input.tmp" \
> "${outdir}/alignments/${id}.fasta" 2>> "${outdir}/mafft.log"
# in case of timeout, keep unaligned input
if [ $? -eq 124 ]; then
echo -e "\nWarning: Alignment failed for ${id}!" | tee "${outdir}/mafft.log"
rm "${outdir}/alignments/${id}.fasta" 2> /dev/null
fasta_formatter -i "${outdir}/mafft_input.tmp" -o "${outdir}/alignments/${id}.fasta"
fi
fi
echo -e "\n\n-------------\n\n" >> "${outdir}/mafft.log"
# visualize query (marker) coverage
fasta_formatter -i "$targets" | grep "^>${id}$" -A1 > "${outdir}/temp.fasta"
./query_cov.py "${outdir}/temp.fasta" "${outdir}/query_intervals/${id}.gff3" > "${outdir}/query_coverage/${id}.fasta"
progress_bar "$iter" "$i_max" 40 processed
(( iter++ ))
done
# check for overlapping loci
############################
./detect_overlap.py "${outdir}/genomic_ranges" "${outdir}"
# calculate MSAs for each group of overlapping loci
###################################################
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
echo "overlapping loci found, compute group-wise alignments..."
rm -rf "$outdir/"{genomic_ranges_groupwise,genomic_sequences_groupwise,alignments_groupwise}
mkdir -p "$outdir/"{genomic_ranges_groupwise,genomic_sequences_groupwise,alignments_groupwise}
echo -e "\n\n-------------\n-------------\ngroup-wise alignments:\n\n" >> "${outdir}/mafft.log"
i_max=$(grep -c "^group" "${outdir}/groups_of_overlapping_loci.txt")
for i_group in $(seq 1 "$i_max"); do
ids_concatenated=$(sed -n "${i_group}p" "${outdir}/groups_of_overlapping_loci.txt" | sed 's/^group [0-9]\+: //')
IFS=',' read -r -a ids <<< "$ids_concatenated"
if [ ${#ids[@]} -eq 1 ]; then
cp "${outdir}/alignments/${ids[0]}.fasta" "${outdir}/alignments_groupwise/group${i_group}.fasta"
else
# extract/modify sequences
# 1) genome ("+" strand)
########################
for id in "${ids[@]}"; do
cat "${outdir}/genomic_ranges/${id}.gff3"
done > "${outdir}/ranges.tmp"
ref_start=$(cut -f4 "${outdir}/ranges.tmp" | sort -n | head -n1)
ref_end=$(cut -f5 "${outdir}/ranges.tmp" | sort -n | tail -n1)
cat "${outdir}/genomic_ranges/${ids[0]}.gff3" | awk -v "i=$ref_start" -v "j=$ref_end" -v quid="$ids_concatenated" '
BEGIN {
OFS = "\t"
}
{
$4 = i
$5 = j
$9 = quid
print
}
' > "$outdir/genomic_ranges_groupwise/group${i_group}.gff3"
bedtools getfasta \
-fi "$genome" \
-bed "${outdir}/genomic_ranges_groupwise/group${i_group}.gff3" \
-fo "${outdir}/genomic_sequences_groupwise/${i_group}.fasta"
# switch to lower case
fasta_formatter -i "${outdir}/genomic_sequences_groupwise/${i_group}.fasta" \
| awk '{ if ($0 !~ />/) {print tolower($0)} else {print $0} }' \
> "${outdir}/temp.fasta"
mv "${outdir}/temp.fasta" "${outdir}/genomic_sequences_groupwise/${i_group}.fasta"
# find and highlight intronic regions
if [ $# -eq 4 ]
then
gff2bed < "$outdir/genomic_ranges_groupwise/group${i_group}.gff3" | cut -f1-6 > "${outdir}/temp.bed"
bedtools intersect \
-a "${outdir}/temp.bed" \
-b "$intronic" \
-wo \
| awk 'BEGIN{OFS="\t"}{print $4,$5,$6,$7,$8,$9}' \
> "${outdir}/introns.tmp"
n_introns=$(wc -l < "${outdir}/introns.tmp")
range_start=$(cut -f2 "${outdir}/temp.bed")
if [ $n_introns -le $max_introns ]
then
for k in $(seq 1 "$n_introns")
do
read -r intron_start intron_end <<<"$(sed -n "${k}p" "${outdir}/introns.tmp" | cut -f5,6)"
# intron positions relative to pair
a=$(( intron_start - range_start ))
b=$(( intron_end - range_start ))
cat "${outdir}/genomic_sequences_groupwise/${i_group}.fasta" | utils/upint.py $a $b > "${outdir}/temp.fasta"
mv "${outdir}/temp.fasta" "${outdir}/genomic_sequences_groupwise/${i_group}.fasta"
done
else
echo -e "\nWarning: group with $n_introns introns detected, skip intron highlighting"
fi
fi
echo -e "group ${i_group} (note: group indices can be changed later)\n" >> "${outdir}/mafft.log"
# 2) combine queries (already extracted)
########################################
for id in "${ids[@]}"; do
cat "${outdir}/query_sequences/${id}.fasta"
done > "${outdir}/queries.tmp"
# align genomic and query sequences
###################################
timeout "${mafft_timeout}" mafft \
--inputorder \
--preservecase \
--auto \
--addfragments "${outdir}/queries.tmp" \
"${outdir}/genomic_sequences_groupwise/${i_group}.fasta" \
> "${outdir}/alignments_groupwise/group${i_group}.fasta" 2>> "${outdir}/mafft.log"
# in case of timeout, remove alignment
if [ $? -eq 124 ]; then
rm "${outdir}/alignments_groupwise/group${i_group}.fasta" 2> /dev/null
fi
# Sometimes mafft --addframgents fails if the reference contains too much ambiguous nucleotides.
# --maxambiguous 1.0 could help, but is only available in MAFFT's online version.
# As a workaround, missing alignments are recomputed without the --addfragments option:
if ! test -s "${outdir}/alignments_groupwise/group${i_group}.fasta"; then
echo -e "\nWarning: mafft --addfragments failed, alignment repeated without the --addfragments option\n" >> "${outdir}/mafft.log"
cat "${outdir}/genomic_sequences_groupwise/${i_group}.fasta" "${outdir}/queries.tmp" > "${outdir}/mafft_input.tmp"
timeout "${mafft_timeout}" mafft \
--inputorder \
--preservecase \
--auto \
"${outdir}/mafft_input.tmp" \
> "${outdir}/alignments_groupwise/group${i_group}.fasta" 2>> "${outdir}/mafft.log"
# in case of timeout, keep unaligned input
if [ $? -eq 124 ]; then
echo -e "\nWarning: Alignment failed for current group!" | tee "${outdir}/mafft.log"
rm "${outdir}/alignments_groupwise/group${i_group}.fasta" 2> /dev/null
fasta_formatter -i "${outdir}/mafft_input.tmp" -o "${outdir}/alignments_groupwise/group${i_group}.fasta"
fi
fi
echo -e "\n\n-------------\n\n" >> "${outdir}/mafft.log"
fi
progress_bar "$i_group" "$i_max" 40 processed
done
fi
# summarize results
###################
echo -e "\nsummarize results..."
Rscript --vanilla summarize.r "${outdir}"
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
Rscript --vanilla summarize_groupwise.r "${outdir}"
fi
# add group column to summary.txt (if overlapping loci present)
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
cat "${outdir}"/groups_of_overlapping_loci.txt | awk -F "[ ,]" '{gsub(/:/, "", $2); for (i=3; i<=NF; i++) {print $i"\t"$2}}' | sort -k 1b,1 > group_mapping.txt
sort -k 1b,1 "${outdir}/summary.txt" > "${outdir}/summary.tmp"
join -t$'\t' -j 1 "${outdir}/summary.tmp" group_mapping.txt > "${outdir}/summary.txt"
fi
# sort table(s)
sort -k3,3nr -k2,2nr -k1,1 "${outdir}/summary.txt" > "${outdir}/summary.tmp"
mv "${outdir}/summary.tmp" "${outdir}/summary.txt"
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
sort -k3,3nr -k2,2nr -k1,1 "${outdir}/summary_groupwise.txt" > "${outdir}/summary_groupwise.tmp"
mv "${outdir}/summary_groupwise.tmp" "${outdir}/summary_groupwise.txt"
fi
# change group indices for better clarity (according to order of appearance in summary.txt):
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
# 1) in summary.txt (store order in group_order.tmp)
gawk -v "outdir=$outdir" '
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"
max = 1
OFS = "\t"
}
{
if (!new[$6]) {
new[$6] = max
$6 = max
max++
}
else {
$6 = new[$6]
}
print
}
END {
for (i in new) {
printf new[i]" " > outdir"/group_order.tmp"
}
}' "$outdir"/summary.txt | column -t > "$outdir"/summary.tmp
mv "$outdir"/summary.tmp "$outdir"/summary.txt
# 2) in summary_groupwise.txt
awk '{for (i=1; i<=NF; i++) {print "s/^"i"\\t/"$i":\\t/"}}' "$outdir"/group_order.tmp > "$outdir"/replacement.sed
sed -i -f "$outdir"/replacement.sed "$outdir"/summary_groupwise.txt
sed -i 's/:\t/\t/' "$outdir"/summary_groupwise.txt
column -t "$outdir"/summary_groupwise.txt > "$outdir"/summary.tmp
mv "$outdir"/summary.tmp "$outdir"/summary_groupwise.txt
# 3) in alignment file names
arr=( $(cat "$outdir"/group_order.tmp) )
for i in "${!arr[@]}"; do
old=$(( i + 1 ))
new="${arr[$i]}"
mv "$outdir"/alignments_groupwise/group${old}.fasta "$outdir"/alignments_groupwise/group${new}.tmp
done
for i in "${!arr[@]}"; do
old=$(( i + 1 ))
new="${arr[$i]}"
mv "$outdir"/alignments_groupwise/group${new}.tmp "$outdir"/alignments_groupwise/group${new}.fasta
done
# 4) in list of groups
awk '{for (i=1; i<=NF; i++) {print "s/^group "i":/group "$i"/"}}' "$outdir"/group_order.tmp > "$outdir"/replacement.sed
sed -f "$outdir"/replacement.sed "$outdir"/groups_of_overlapping_loci.txt | sort -k2,2n | awk '{$2=$2":"; print}' > "$outdir"/groups_of_overlapping_loci.tmp
mv "$outdir"/groups_of_overlapping_loci.tmp "$outdir"/groups_of_overlapping_loci.txt
rm "$outdir"/{group_order.tmp,replacement.sed}
fi
# final cleanup
rm -rf "$outdir"/{genomic_sequences,query_sequences,genomic_ranges,query_intervals,temp.fasta,temp.bed,introns.tmp,mafft_input.tmp,summary.tmp} 2> /dev/null
rm -f "${targets}.fai" "${genome}.fai" 2> /dev/null
if grep -q "," "${outdir}/groups_of_overlapping_loci.txt"
then
rm -rf "$outdir"/{genomic_sequences_groupwise,genomic_ranges_groupwise,ranges.tmp,queries.tmp,summary_groupwise.tmp} 2> /dev/null
fi