-
Notifications
You must be signed in to change notification settings - Fork 0
/
influenza_consensus.sh
executable file
·197 lines (167 loc) · 7.25 KB
/
influenza_consensus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env bash
#set -eo pipefail
usage() {
echo "
influenza_consensus.sh [options]
Required arguments:
-i|--input .csv file with first column as sample name and second column as path to fastq, no headers required
-o|--output Path to output directory
--db Path to Centrifuge database for raw read taxonomic classification
Optional arguments:
-t|--threads Number of threads [Default = 32]
-s|--segment Target specific Influenza A genomic segments for consensus calling with each segment number delimited by a comma (Example: -s 1,2,5,6)
--subsample Specify the target coverage for consensus calling [Default = 1000]
-m|--model Specify the flowcell chemistry used for Nanopore sequencing [Default = r941_min_high_g360]
--notrim Disable adaptor trimming by Porechop
--keep-tmp Keep all temporary files
-h|--help Display help message
"
}
# define global variables
script_dir=$(dirname $(realpath $0))
script_name=$(basename $0 .sh)
DB_PATH="/db/viralRefSeq_InfA_custom"
# parse arguments
opts=`getopt -o hi:o:t:s:m: -l help,input:,output:,threads:,db:,notrim,segment:,model:,keep-tmp,subsample:,mode: -- "$@"`
eval set -- "$opts"
if [ $? != 0 ] ; then usage; exit 1 ; fi
if [[ $1 =~ ^--$ ]] ; then echo "influenza_consensus: Invalid arguments used, exiting"; usage; exit 1 ; fi
while true; do
case "$1" in
-i|--input) INPUT_PATH=$2; shift 2;;
-o|--output) OUTPUT_PATH=$2; shift 2;;
--db) DB_PATH=$2; shift 2;;
-t|--threads) THREADS=$2; shift 2;;
--mode) MODE=$2; shift 2;;
-m|--model) MODEL=$2; shift 2;;
--subsample) SUBSAMPLE=$2; shift 2;;
--notrim) TRIM=0; shift 1;;
--keep-tmp) KEEP_TMP=1; shift 1;;
-s|--segment) SEGMENTS=$2; shift 2;;
--) shift; break ;;
-h|--help) usage; exit 0;;
esac
done
# check if required arguments are given
if test -z $INPUT_PATH; then echo "influenza_consensus: Required argument -i is missing, exiting"; exit 1; fi
if test -z $OUTPUT_PATH; then echo "influenza_consensus: Required argument -o is missing, exiting"; exit 1; fi
if test -z $DB_PATH; then echo "influenza_consensus: Required argument --db is missing, exiting"; exit 1; fi
# check dependencies
medaka_consensus -h 2&>1 /dev/null
if [[ $? != 0 ]]; then echo "influenza_consensus: medaka cannot be called, check its installation"; exit 1; fi
seqtk 2&>1 /dev/null
if [[ $? != 1 ]]; then echo "influenza_consensus: seqtk cannot be called, check its installation"; exit 1; fi
snakemake -h > /dev/null
if [[ $? != 0 ]]; then echo "influenza_consensus: snakemake cannot be called, check its installation"; exit 1; fi
centrifuge -h > /dev/null
if [[ $? != 0 ]]; then echo "influenza_consensus: centrifuge cannot be called, check its installation"; exit 1; fi
porechop -h > /dev/null
if [[ $? != 0 ]]; then echo "influenza_consensus: porechop cannot be called, check its installation"; exit 1; fi
# validate model parameter input if specified
readarray models < ${script_dir}/medaka_models.txt
if ! test -z $MODEL; then
# test if invalid characters used
if ! [[ $(echo "${models[@]}" | grep -w "${MODEL}") ]]; then
echo "Invalid medaka model passed to the -m argument, exiting"
echo "Supported models include: $(echo ${models[@]} | sed 's/ /, /g')"
exit 1
fi
else
# Set default model if not specified
MODEL="r941_min_high_g360"
fi
# validate segment parameter input if specified
if ! test -z $SEGMENTS; then
for segment in $(echo $SEGMENTS | sed 's/,/\n/g'); do
# check if individual elements are valid integers between [1:8]
if [[ $(echo -n $segment | wc -m) -ne 1 ]]; then
echo "Invalid characters passed to the -s argument, exiting"
exit 1
fi
if ! [[ $segment =~ ^[1-8]$ ]]; then
echo "Invalid characters passed to the -s argument, exiting"
exit 1
fi
done
else
# Set default segments to produce = [1:8] if not specified
if test -z $SEGMENTS; then SEGMENTS="1,2,3,4,5,6,7,8"; fi
fi
# validate input samples.csv
if ! test -f $INPUT_PATH; then echo "influenza_consensus: Input sample file does not exist, exiting"; exit 1; fi
while read lines; do
sample=$(echo $lines | cut -f1 -d',')
path=$(echo $lines | cut -f2 -d',')
if ! test -d $path; then
echo "influenza_consensus: ${sample} directory cannot be found, check its path listed in the input file, exiting"
exit 1
fi
done < $INPUT_PATH
# validate database path
if ! test -f ${DB_PATH}.1.cf; then echo "influenza_consensus: Specified Centrifuge database does not exist, exiting"; exit 1; fi
# create output directory if does not exist
if ! test -d $OUTPUT_PATH; then mkdir -p $OUTPUT_PATH; fi
# Set default number of threads if not specified
if test -z $THREADS; then THREADS=32; fi
# Set default trim mode to true if not specified
if test -z $TRIM; then TRIM=1; fi
# Set default keep temporary files (KEEP_TMP) to 0 if not specified
if test -z $KEEP_TMP; then KEEP_TMP=0; fi
# Set default subsample depth to 1000 if not specified
if test -z $SUBSAMPLE; then SUBSAMPLE=1000; fi
# Set default read selection mode to dynamic if not specified
if test -z $MODE; then MODE="dynamic"; fi
# validate read selection mode if specified
if ! [[ $MODE =~ ^(dynamic|static)$ ]]; then echo "${script_name}: Invalid mode option passed to the --mode argument, accepted values are dynamic/static, exiting"; exit 1; fi
# Remove existing analysis html report and summary statistics file in OUTPUT_PATH
if test -f $OUTPUT_PATH/InfA_analysis_viz.html; then rm $OUTPUT_PATH/InfA_analysis_viz.html; fi
if test -f $OUTPUT_PATH/summary_statistics.csv; then rm $OUTPUT_PATH/summary_statistics.csv; fi
while read lines; do
sample=$(echo $lines | cut -f1 -d',')
if test -d $OUTPUT_PATH/$sample; then
rm -rf $OUTPUT_PATH/$sample
fi
done < $INPUT_PATH
# call snakemake
snakemake --snakefile $script_dir/SnakeFile --cores $THREADS \
--keep-going \
--config samples=$(realpath $INPUT_PATH) \
outdir=$(realpath $OUTPUT_PATH) \
segments="$SEGMENTS" \
pipeline_dir=$script_dir \
centrifuge_db=$DB_PATH \
trim=$TRIM \
model=$MODEL \
threads=$THREADS \
subsample=$SUBSAMPLE \
mode=$MODE \
--nolock \
--use-conda
# clean up temporary directories
if [[ $KEEP_TMP -eq 0 ]]; then
while read lines; do
sample=$(echo $lines | cut -f1 -d',')
if test -f $OUTPUT_PATH/$sample/$sample.fastq*; then rm $OUTPUT_PATH/$sample/$sample.fastq*; fi
for dir in nanoplot fastq medaka centrifuge binned_fastq porechop draft_consensus subsample_fastq; do
if test -d $OUTPUT_PATH/$sample/$dir; then
rm -rf $OUTPUT_PATH/$sample/$dir
fi
done
done < $INPUT_PATH
fi
# check status of consensus building for each segment
# and rename file to contain sample names
IFS=","
read -r -a segment_array <<< $SEGMENTS
while read lines; do
sample=$(echo "$lines" | cut -f1 -d',')
echo -e "influenza_consensus: \033[0;33mConsensus Building Summary for $sample\033[0m"
for i in ${segment_array[@]}; do
if test -f $OUTPUT_PATH/$sample/consensus/segment_${i}.fa; then
echo -e " \033[0;33mSegment ${i}\033[0m: \033[0;32mSUCCESS\033[0m"
mv $OUTPUT_PATH/${sample}/consensus/segment_${i}.fa $OUTPUT_PATH/${sample}/consensus/${sample}_segment_${i}.fa
else
echo -e " \033[0;33mSegment ${i}\033[0m: \033[0;31mFAIL\033[0m"
fi
done
done < $INPUT_PATH