-
Notifications
You must be signed in to change notification settings - Fork 0
/
genomeAnalyzer.py
191 lines (153 loc) · 7.34 KB
/
genomeAnalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# !/usr/bin/env python3
# University of California, Santa Cruz - BME 160
# Biomolecular Engineering and Bioinformatics
# Name: (zmmason)
import sys
class FastAreader:
"""
Define objects to read FastA files.
instantiation:
thisReader = FastAreader ('testTiny.fa')
usage:
for head, seq in thisReader.readFasta():
print (head,seq)
"""
def __init__(self, fname=''):
"""contructor: saves attribute fname """
self.fname = fname
def doOpen(self):
""" Handle file opens, allowing STDIN."""
if self.fname == '':
return sys.stdin
else:
return open(self.fname)
def readFasta(self):
""" Read an entire FastA record and return the sequence header/sequence"""
header = ''
sequence = ''
with self.doOpen() as fileH:
header = ''
sequence = ''
# skip to first fasta header
line = fileH.readline()
while not line.startswith('>'):
line = fileH.readline()
header = line[1:].rstrip()
for line in fileH:
if line.startswith('>'):
yield header, sequence
header = line[1:].rstrip()
sequence = ''
else:
sequence += ''.join(line.rstrip().split()).upper()
yield header, sequence
class NucParams:
"""Program to calculate the physical-chemical properties of a protein sequence."""
rnaCodonTable = {
# RNA codon table
# U
'UUU': 'F', 'UCU': 'S', 'UAU': 'Y', 'UGU': 'C', # UxU
'UUC': 'F', 'UCC': 'S', 'UAC': 'Y', 'UGC': 'C', # UxC
'UUA': 'L', 'UCA': 'S', 'UAA': '-', 'UGA': '-', # UxA
'UUG': 'L', 'UCG': 'S', 'UAG': '-', 'UGG': 'W', # UxG
# C
'CUU': 'L', 'CCU': 'P', 'CAU': 'H', 'CGU': 'R', # CxU
'CUC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R', # CxC
'CUA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R', # CxA
'CUG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R', # CxG
# A
'AUU': 'I', 'ACU': 'T', 'AAU': 'N', 'AGU': 'S', # AxU
'AUC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S', # AxC
'AUA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R', # AxA
'AUG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R', # AxG
# G
'GUU': 'V', 'GCU': 'A', 'GAU': 'D', 'GGU': 'G', # GxU
'GUC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G', # GxC
'GUA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G', # GxA
'GUG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G' # GxG
}
dnaCodonTable = {key.replace('U', 'T'): value for key, value in rnaCodonTable.items()}
def __init__(self, inString=''):
"""
initializing atributes of the dna/rna sequence provided from the fastAreader.
- created dictionaries: nucleotide composition, amino acid composition, codon composition
- all dictionaries hold a zero value to be updated when 'addSequence' adds to them
"""
self.nucComp = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'U': 0, 'N': 0}
self.codonComp = {} # creates dict to hold codons from the rna & dna codon tables with '0' set values
for i in range(len(NucParams.rnaCodonTable)):
newCodon = (NucParams.rnaCodonTable.keys() or NucParams.dnaCodonTable.keys())
newCodon = dict.fromkeys(newCodon, 0)
self.codonComp.update(newCodon)
self.aaComp = { # empty dict for aa
'A': 0, 'G': 0, 'M': 0, 'S': 0, 'C': 0,
'H': 0, 'N': 0, 'T': 0, 'D': 0, 'I': 0,
'P': 0, 'V': 0, 'E': 0, 'K': 0, 'Q': 0,
'W': 0, 'F': 0, 'L': 0, 'R': 0, 'Y': 0
}
self.addSequence(inString) # gives the sequences to 'addSequence'
def addSequence(self, inSeq):
"""
Method accepts new sequences in the form of nucleotides (ACGTUN), and is presumed to start in frame 1.
This data is added to the dictionary data in the '__init__' method
"""
for nuc in inSeq: # for loop to cyle and count individual nucs in the input string to add to 'self.nucComp'"""
if nuc in self.nucComp: # count valid nuc
self.nucComp[nuc] += 1 # adds 1 to dict if found in sequence
invalBase = 'N' # invalid base to search and remove codon that contains it
self.codonList = [(inSeq[i:i + 3]) for i in
range(0, len(inSeq), 3)] # splits the string sequence every 3 nuc's and puts into list
for codon in self.codonList:
if invalBase not in codon: # removing codon with invalid base
if codon in self.codonComp: # count valid codon
self.codonComp[codon] += 1 # adds 1 to dict if found in sequence
aaList = [] # list to hold aa's from rna & dna dicts
for aa in NucParams.rnaCodonTable or NucParams.dnaCodonTable:
aaCount = NucParams.rnaCodonTable[aa] or NucParams.dnaCodonTable[aa], self.codonComp[
aa] # sets format '(aa, 0)'
aaList.append(aaCount) # adds each aa to list
for aa, count in aaList:
total = self.aaComp.get(aa, 0) + count # uses dict of aa's and adds for each occurrance
self.aaComp[aa] = total # adds total of each aa to dict
def aaComposition(self):
"""
Method will return a dictionary of counts over the 20 amino acids and stop codons
in the RNA/DNA codon tables.
"""
return (self.aaComp)
def nucComposition(self):
"""
Method returns a dictionary of counts of valid nucleotides (ACGTNU) found in
the analysis.
"""
return (self.nucComp)
def codonComposition(self):
"""
This returns a dict that counts # of specific codons (ACGTU). Codons that contain 'N' bases are
discarded in this method. All codon counts are stored as RNA codons.
"""
return (self.codonComp)
def nucCount(self):
"""
This returns an integer value which is the sum of every valid nucleotide (ACGTUN) found.
"""
return sum(self.nucComp.values())
def main ():
"""
program takes a fatsa file translated by the 'FastAreader' and runs through 'nucParams'
in 'sequenceAnalysis.py'. It is able to then analyze genetic sequences (DNA/RNA) to produce &
print computational data for that fasta file relating to: amino acids, codons, nucleotides.
"""
myReader = FastAreader() # make sure to change this to use stdin if using a filename for testing
myNuc = NucParams()
for head, inSeq in myReader.readFasta():
myNuc.addSequence(inSeq)
gcCount = (myNuc.nucComposition().get('G')+myNuc.nucComposition().get('C'))
print("Sequence length = {:.2f} Mb\n" .format(myNuc.nucCount()/1e6)) # prints seq length
print("GC content = {:.1f}%\n" .format(gcCount/myNuc.nucCount()*100)) # prints gc content in seq length
nucs = sorted(myNuc.rnaCodonTable.items(), key=lambda aa: aa[1]) # sort codons in alpha order, by Amino Acid
for nuc, aa in nucs: # calculate relative codon usage for each codon and print
val = myNuc.codonComp[nuc]/(sum(myNuc.codonComp.values())) # ratio of specific codon to total codons
print ('{:s} : {:s} {:5.1f} ({:6d})'.format(nuc, aa, val*100, myNuc.codonComp[nuc]))
if __name__ == "__main__":
main()