-
Notifications
You must be signed in to change notification settings - Fork 6
/
translate_fasta.awk
executable file
·155 lines (128 loc) · 5.5 KB
/
translate_fasta.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/awk -f
# AUTHOR: Pablo Vinuesa, @pvinmex, https://www.ccg.unam.mx/~vinuesa/
# source: https://github.com/vinuesa/intro2linux
# translate_fasta.awk VERSION:0.1
# AIM: translates a fasta file containing one or more CDSs
# using the universal genetic code
BEGIN {
progname = "translate_fasta.awk"
version = 0.2 # dec 26, 2020
# check that the script receives input either from file or pipe
if ( ARGC < 2 )
print_help(progname, version)
# Model FASTA file
RS=">"
FS=OFS="\n"
# print the FASTA sequences in ascending order by locus_tag number
PROCINFO["sorted_in"] = "@ind_num_asc"
input_fasta=ARGV[1]
# initialize a hash named "c" holding the codon-aminoacid pairs
# based on the universal genetic code
c["ATA"]="I"; c["ATC"]="I"; c["ATT"]="I"; c["ATG"]="M";
c["ACA"]="T"; c["ACC"]="T"; c["ACG"]="T"; c["ACT"]="T";
c["AAC"]="N"; c["AAT"]="N"; c["AAA"]="K"; c["AAG"]="K";
c["AGC"]="S"; c["AGT"]="S"; c["AGA"]="R"; c["AGG"]="R";
c["CTA"]="L"; c["CTC"]="L"; c["CTG"]="L"; c["CTT"]="L";
c["CCA"]="P"; c["CCC"]="P"; c["CCG"]="P"; c["CCT"]="P";
c["CAC"]="H"; c["CAT"]="H"; c["CAA"]="Q"; c["CAG"]="Q";
c["CGA"]="R"; c["CGC"]="R"; c["CGG"]="R"; c["CGT"]="R";
c["GTA"]="V"; c["GTC"]="V"; c["GTG"]="V"; c["GTT"]="V";
c["GCA"]="A"; c["GCC"]="A"; c["GCG"]="A"; c["GCT"]="A";
c["GAC"]="D"; c["GAT"]="D"; c["GAA"]="E"; c["GAG"]="E";
c["GGA"]="G"; c["GGC"]="G"; c["GGG"]="G"; c["GGT"]="G";
c["TCA"]="S"; c["TCC"]="S"; c["TCG"]="S"; c["TCT"]="S";
c["TTC"]="F"; c["TTT"]="F"; c["TTA"]="L"; c["TTG"]="L";
c["TAC"]="Y"; c["TAT"]="Y"; c["TAA"]="*"; c["TAG"]="*";
c["TGC"]="C"; c["TGT"]="C"; c["TGA"]="*"; c["TGG"]="W";
unknown = "X"
}
# -------------------- #
# >>> MAIN PROGRAM <<< #
# -------------------- #
NR > 1 {
# read the CDS (DNA) sequence into the global seqs hash
read_fasta(input_fasta)
}
END {
# 1. translate the CDS
for (h in seqs) { if(length(seqs[h]) >= 3) translate_dna(h, seqs[h]) }
# 2. print tranlsated fasta
for (h in prots) if ($0 != "") printf ">%s\n%s\n", h, prots[h]
}
#---------------------------------------------------------------------------------------------------------#
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
#---------------------------------------------------------------------------------------------------------#
function read_fasta(fasta, s, h, i)
{
# fill hash seqs with the DNA/CDS sequences
s=""
h=$1
for (i=2; i<=NF; i++) s=s$i
seqs[h]=s
}
#---------------------------------------------------------------------------------------------------------
function translate_dna(header, dna_seq, s, i, p)
{ # receives two arguments: the fasta header and the CDS sequence to be translated
# Initialize variables:
# do-while loop control variable i (nt counter)
# and p, which will hold the translation product
{i=1; p=""; triplet_counter=0}
# Here we run a do-while loop; the do loop is a variation of the while looping statement.
# The do loop always executes the body once and then repeats the body as long as the condition is true
# We use the do-while loop, to get a first triplet string saved in s;
# then the while loop keeps going until substr() got the last triplet, resulting in an empty s="".
do {
# First check that the script got some input
# if not, exit with an error message
if(length(dna_seq) == 0) {
print "ERROR: need a DNA sequence string to translate (valid DNA sequence, divisible by 3) "
exit 1
# Check that the DNA sequence string is divisible by 3 using the modulo operator
# if not, exit with an error message
} else if(length(dna_seq)%3) {
print "ERROR: input DNA sequence not divisible by 3 ..."
exit 2
}
# use substr() to split the input sequence (dna_seq) into triplets saved in s
s = substr(dna_seq, i, 3)
# keep track of processed triplets (codons)
triplet_counter++
# check that the input corresponds to valid nucleotides
if ( s ~ /[^acgtACGT]+/ ) {
print "ERROR: input triplet", triplet_counter, "=", s,
"contains a non-valid nucleotide symbol ..."
exit 3
}
# make sure that input nt symbols are uppercase to match the hash keys
s = toupper(s)
# use the codon hash c as lookup table to translate the s triplet
# appending c[s] to the growing peptide p
{
# break out of loop if we get no more triplets
# out of the input DNA string with substr()
if (c[s]=="") {
break
}
else if (s in c == 0) {
# if the triplet is not contained in c, append "X" to p
p=p unknown
} else {
# append aminoacid c[s] to growing peptide
p = p c[s]
}
}
i=i+3 # increment the counter of processed dna nucleotides by 3
}
# run while loop until substring cannot retrieve any more triplets
while (s!="")
prots[header]=p
}
#---------------------------------------------------------------------------------------------------------
function print_help(prog, vers) # (prog, vers)
{
print "USAGE: examples for", prog, "v"vers
print " ./"prog, "CDSs.fasta" > "/dev/stderr"
print " OR" > "/dev/stderr"
print " cat CDSs.fasta | ./"prog, "-" > "/dev/stderr"
exit 1
}