-
Notifications
You must be signed in to change notification settings - Fork 0
/
Filter_assembly.sh
139 lines (112 loc) · 3.87 KB
/
Filter_assembly.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
# Function to display usage information
usage() {
echo "Usage: $0 -i <fasta_file> -b <busco_file> -a <asm_stats_file> -f <blob_file> -o <output_name>"
echo
echo "Options:"
echo " -i <fasta_file> Path to the fasta file"
echo " -b <busco_file> Path to the BUSCO file"
echo " -a <asm_stats_file> Path to the assembly stats file"
echo " -f <blob_file> Path to the blob file"
echo " -o <output_name> Desired name for the output file"
echo " -h Display this help message"
exit 1
}
# Check if seqkit is installed
if ! command -v seqkit &> /dev/null; then
echo "Error: seqkit is not installed. Please install seqkit to proceed."
exit 1
fi
# Check if no arguments were passed and display usage
if [ $# -eq 0 ]; then
usage
fi
# Read command-line arguments
while getopts "i:b:a:f:o:h" flag; do
case "${flag}" in
i) fasta=${OPTARG};;
b) busco_file=${OPTARG};;
a) asm_stats_filename=${OPTARG};;
f) blob_file=${OPTARG};;
o) output_name=${OPTARG};;
h) usage;;
*) usage;;
esac
done
# Check if all required arguments are provided
if [ -z "${fasta}" ] || [ -z "${busco_file}" ] || [ -z "${asm_stats_filename}" ] || [ -z "${blob_file}" ] || [ -z "${output_name}" ]; then
echo "Error: Missing required arguments."
usage
fi
############## Logging braces open #####################
{
###### Filtering assembly based on contigs size
contigs_siz_fil() {
local fileA=$1
local fileB=$2
# Extracting contigs from fileA
fil_contigs=$(sed '1,/>>>>>>> Coverage per contig/d' "$fileA" | awk -F'\t' 'NR>1 && $3 < 1000 {print $2}')
# contigs to compare (example contigs)
contigs=($fil_contigs)
# Using awk to filter contigs that do not match the column of fileB
unmatched_contigs=$(awk -F'\t' -v contigs="${contigs[*]}" '
BEGIN {
#Split the contigs into an array
split(contigs, elementArray, " ")
#Create an associative array to store contigs
for (i in elementArray) {
comparecontigs[elementArray[i]] = 1
}
}
# Skip lines starting with #
/^#/ {
next
}
{
#Check if the element is in the files column
if ($3 in comparecontigs) {
delete comparecontigs[$3]
}
}
END {
#Print contigs that were not found in the file
for (e in comparecontigs) {
print e
}
}' "$fileB")
echo "$unmatched_contigs"
}
######### Filtering assembly based on contaminated contigs
contigs_cont_fil() {
local fileA=$1
#Extract contaminated contigs
cont_contigs=$(awk -F "," 'NR > 1 {
# Remove double quotes from fields
gsub(/"/, "", $6);
gsub(/"/, "", $7);
# Check if 6th column is neither "Arthropoda" nor "no-hit"
if ($6 != "Arthropoda" && $6 != "no-hit") {
print $7;
}
}' "$fileA")
echo "$cont_contigs"
}
echo -e "File $asm_stats_filename is loaded\n"
echo -e "File $busco_file is loaded\n"
echo -e "File $fasta is loaded\n"
echo -e "File $blob_file is loaded\n"
# Call the size filter function
siz_fil_contigs=$(contigs_siz_fil "$asm_stats_filename" "$busco_file")
#echo "$siz_fil_contigs" #> size_fil_contig.txt
# Call the contaminantion filter function
cont_fil_contigs=$(contigs_cont_fil "$blob_file")
#echo "$cont_fil_contigs"
echo -e "$siz_fil_contigs\n$cont_fil_contigs" > contigs_rmvd.txt
#SeqKit to filter scaffold
seqkit grep -v -f contigs_rmvd.txt $fasta > $output_name
echo -e "Filtered fasta is stored in $output_name\n"
ori_contig_num=$(grep -c "^>" $fasta)
fil_contig_num=$(grep -c "^>" $output_name)
echo -e "Number of contigs\nBefore filteration:$ori_contig_num\nAfter filteration :$fil_contig_num\n"
} 2>&1 | tee -a file.log
####################### Logging braces closed #####################