-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta_translate.py
executable file
·122 lines (88 loc) · 3.72 KB
/
fasta_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import sys
import argparse
from Bio import SeqIO
from Bio.Data import CodonTable
from dnachisel.biotools import reverse_translate
from collections import Counter
from jakomics import colors
import jak_utils
# OPTIONS #####################################################################
parser = argparse.ArgumentParser(description='', formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-f', '--fasta',
required=True)
parser.add_argument('--frames',
nargs="*",
default=[1])
parser.add_argument('--reverse',
action='store_true',
help='Reverse translate a fasta file of proteins')
args = parser.parse_args()
###
def report_problem_aa(seq_record):
standard_aa = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
problems = False
aa_freq = Counter(str(seq_record.seq))
aa_prob_freq = {k:aa_freq[k] for k in aa_freq.keys() if k not in standard_aa}
if len(aa_prob_freq) > 0:
print(f"{colors.bcolors.YELLOW}WARNING\tnon-standard AAs\t{aa_prob_freq}\t{seq_record.description}{colors.bcolors.END}", file = sys.stderr)
problems = True
if problems:
return(1)
else:
return(0)
def fix_ambiguous_aa(seq, add_stop):
'''
The pyrrolysine and selenocysteine can be changed at the codon table step in the main code,
but other IUPAC ambiguous characters need to just be changed to one of the
'''
replacements = {
"X": "O",
"B": 'D', # Aspartic acid or Asparagine
"Z": 'E' # Glutamic acid or Glutamine
}
seq = str(seq).upper() + "*"
for aa, codon in replacements.items():
seq = seq.replace(aa, codon)
return(seq)
###
if __name__ == "__main__":
jak_utils.header()
if args.reverse == False:
for seq_record in SeqIO.parse(args.fasta, "fasta"):
for frame in args.frames:
frame = int(frame)
absFrame = frame
if frame > 0:
seq = seq_record.seq
elif frame < 0:
seq = seq_record.seq.reverse_complement()
absFrame = abs(frame)
# trim seq to modulus 3 to remove the biopython error
trim = len(seq[absFrame-1:]) % 3
trim = 0 - trim
if trim != 0:
seq = seq[:trim]
print(">"+seq_record.id+"_"+str(frame))
print(seq[absFrame-1:].translate(table=11))
else:
CodonTable.unambiguous_dna_by_name['Standard'].back_table['U'] = 'TGA'
CodonTable.unambiguous_dna_by_name['Standard'].back_table['O'] = 'TAG'
CodonTable.unambiguous_dna_by_name['Standard'].forward_table['TGA'] = 'U'
CodonTable.unambiguous_dna_by_name['Standard'].forward_table['TAG'] = 'O'
failed = 0
warnings = 0
for seq_record in SeqIO.parse(args.fasta, "fasta"):
# report problems
warnings = warnings + report_problem_aa(seq_record)
# fix problematic characters
fixed_seq = fix_ambiguous_aa(seq_record.seq, add_stop = True)
try:
s = reverse_translate(fixed_seq, table="Standard")
print(f">{seq_record.description}")
print(s)
except KeyError as e:
print(f"{colors.bcolors.RED}ERROR - {e} not found: {seq_record.id}{colors.bcolors.END}", file=sys.stderr)
failed += 1
continue
print(f"{colors.bcolors.YELLOW}Warnings: {warnings}{colors.bcolors.END}", file=sys.stderr)
print(f"{colors.bcolors.RED}Failures: {failed}{colors.bcolors.END}", file=sys.stderr)