-
Notifications
You must be signed in to change notification settings - Fork 0
/
from_seqs_to_OTU.sh
196 lines (138 loc) · 7.29 KB
/
from_seqs_to_OTU.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/bin/bash
########################################################################################
########################################################################################
# This is a collection of bash command, used for the initial steps of data analysis;
# from raw sequences to OTUs. Those command were used for microbiota analysis of article
# "IL-13 mRNA tissue content identifies two subsets of adult ulcerative colitis patients
# with different clinical and mucosa-associated microbiota profiles"
# By authors: Alessia Butera, Monica Di Paola, Francesco Vitali, Daniela De Nitto,
# Francesco Covotta, Francesco Borrini, Roberta Pica, Carlotta De Filippo, Duccio Cavalieri,
# Alessandro Giuliani, Annamaria Pronio, Monica Boirivan
# Script author: Francesco Vitali
# DISCLAIMER: this is the collection of bash command that were used for analysis.
# However, for publication purpose, absolute filepath of my local PC were changed with
# variables (declared at the beginning of the script). As such, they were not tested and
# are not guaranteed to work
# DISCLAIMER: Use these command and the information contained here at your own risk!
# I'm not responsible for loss of data
########################################################################################
########################################################################################
# Raw reads files are assumed to be in $currpath/Raw_reads !!
#################################################
# SETTING UP VARIABLES AND CREATING DIRECTORIES #
currpath=$(pwd) # top project directory
core=6 # number of core to use
mkdir $currpath/cutadapt/
mkdir $currpath/quality_control/
mkdir $currpath/renamed/
mkdir $currpath/seq_fasta
mkdir $currpath/cutadapt/trimmed
mkdir $currpath/cutadapt/trimmed/assembled
cd $currpath
##########################################################################################
# Renaming files from the sequencing service provider are usually redundant and "messy". #
# Here samples are renamed. Code should be edited depending on the type of name
# name assumed here is like SampleID-V3-V4_S239_L001_R1_001.fastq.gz
# where SampleID and R1 are the interesting part. "cut" is used to divide
# the name string on a desired character
# output would be SampleID-V3-V4_R1.fastq.gz
name=0
este=".fastq.gz"
for i in *.fastq.gz
do
sample=$(echo "$i" | cut -d "_" -f1)
read=$(echo "$i" | cut -d "_" -f4)
mv $i $currpath/renamed/$sample"_"$read$este
echo -e "$i\t-->\t$sample"_"$read$este" >> log_renamer.txt
done
##################################
# Counting raw reads per sample #
for i in $currpath/renamed/*.fastq.gz
do
echo -n $i ' \t ' >> seq_count_16S.txt
echo $(zcat $i | wc -l) / 4 | bc >> seq_count_16S.txt
done
#######################################################################
# Creating a text file with sample names, it will be used for looping #
for i in $currpath/renamed/*.fastq.gz
do
echo "$i" | cut -d "_" -f1 | >> names.txt
sed 'n; d' names.txt > names_single.txt
done
# coping the file to other folders
cp names_single.txt $currpath/cutadapt/
cp names_single.txt $currpath/cutadapt/trimmed/
#############################################
# Removing sequencing primers with CUTADAPT #
# Reference: http://journal.embnet.org/index.php/embnetjournal/article/view/200
# Guide: https://cutadapt.readthedocs.io/en/stable/guide.html
while read file
do
echo "Running cutadapt on file "${file}""
'#qui le seq per 16S della BMr, cambiare se altri primer'
cutadapt -a CCTACGGGNBGCA...GGATTAGATACCCBNGTAGTC -A GACTACNVGGGTATCTAATCC...CTGSTGCVNCCCGTAGG \
--trim-n -o $currpath/cutadapt/"${file}_R1_cutadapt.fastq.gz" -p $currpath/cutadapt/"${file}_R2_cutadapt.fastq.gz" \
$currpath/renamed/"${file}_R1.fastq.gz" $currpath/renamed/"${file}_R2.fastq.gz" 1>> report_cutadapt_primer.txt
done < names_single.txt
cp report_cutadapt_primer.txt $currpath/quality_control
#################################################################
# Evaluating quality of the reads to choose trimming parameters #
fastqc $currpath/cutadapt/*.fastq.gz -o $currpath/quality_control/ -t $core
multiqc $currpath/quality_control/. -o $currpath/quality_control/
##################################################################
# TRIMMING: removing low quality region towards the end of reads #
# Reference: https://github.com/najoshi/sickle
while read file
do
echo "Running sickle on file "${file}""
echo "Running sickle on file "${file}"" >> stats_trim.txt
sickle pe -f $currpath/cutadapt/"${file}_R1_cutadapt.fastq.gz" -r $currpath/cutadapt/"${file}_R2_cutadapt.fastq.gz" \
-o $currpath/cutadapt/trimmed/"${file}_trimmed_R1.fastq" -p $currpath/cutadapt/trimmed/"${file}_trimmed_R2.fastq" -s $currpath/cutadapt/trimmed/"${file}_singles" \
-t sanger -q 20 -l 200 -g 1>> stats_trim.txt
done < names_single.txt
#####################################
# Joining forward and reverse read #
# Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3933873/
# Guide: https://cme.h-its.org/exelixis/web/software/pear/doc.html
while read file
do
echo "Running pear on file "${file}""
pear -f $currpath/cutadapt/trimmed/"${file}_trimmed_R1.fastq" -r $currpath/cutadapt/trimmed/"${file}_trimmed_R2.fastq" \
-o $currpath/cutadapt/trimmed/assembled/"${file}" -j $core -p 0.001 -u 0 -q 30 1>> stats_assembly.txt
done < names_single.txt
#################################################################
# Evaluating quality on the final, pre-treated and joined reads #
fastqc $currpath/cutadapt/trimmed/assembled/*.fastq.gz -o $currpath/quality_control/ -t $core
multiqc $currpath/quality_control/. -o $currpath/quality_control/
##########################
# Convert fastq in fasta #
for i in $currpath/cutadapt/trimmed/assembled/*.fastq
do
echo "Converting "$i""
name=$(echo "$i" | cut -d "." -f1,2)
fastq_to_fasta -i ./$i -o $currpath/seq_fasta/$name.fasta
done
###################################
# Validate mapping file for QIIME #
validate_mapping_file.py -m $currpath/mapping_colitis.csv -p -b
####################
# Add QIIME header #
add_qiime_labels.py -i $currpath/seq_fasta -m $currpath/mapping_colitis.csv -c InputFileName -o $currpath/seqs_with_header
#####################
# Chimera filtering #
# Reference: https://www.ncbi.nlm.nih.gov/pubmed/27781170
# Guide: https://github.com/torognes/vsearch
vsearch -uchime_ref $currpath/seqs_with_header/combined_seqs.fna --db /usr/local/lib/python2.7/dist-packages/qiime_default_reference/gg_13_8_otus/rep_set/97_otus.fasta \
--nonchimeras $currpath/combined_seqs_chimera_filtered.fna --threads $core
##############################
# Open reference otu picking #
cat > $currpath/otu_picking_param.txt
pick_otus:enable_rev_strand_match True
pick_otus:otu_picking_method uclust
pick_open_reference_otus.py -i $currpath/combined_seqs_chimera_filtered.fna -r /usr/local/lib/python2.7/dist-packages/qiime_default_reference/gg_13_8_otus/rep_set/97_otus.fasta \
-o $currpath/open_ref_otu_picking_uclust -s 0.1 -m uclust -p $currpath/otu_picking_param.txt -a -O 4
########################################
# Discard low sequence coverage OTUs #
#REFERENCE: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3531572/
filter_otus_from_otu_table.py -i $currpath/open_ref_otu_picking_uclust/otu_table_mc2_w_tax_no_pynast_failures.biom -o $currpath/otu_table_mc2_w_tax_no_pynast_failures_filtered.biom \
--min_count_fraction 0.00005