-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSequence_Analysis.py
587 lines (491 loc) · 28.6 KB
/
Sequence_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
# !/usr/bin/env python3
# University of California, Santa Cruz
# Biomolecular Engineering and Bioinformatics
# Name: Zachary Mason (zmmason), Justin Chan (jumchan)
# Group Members: None
class ProteinParam:
"""
The class, ProteinParam, is formatted to have one constructor and seven
methods that calculate their own quantitative characteristics of the
input amino acid sequence.
"""
aa2mw = {
'A': 89.093, 'G': 75.067, 'M': 149.211, 'S': 105.093, 'C': 121.158,
'H': 155.155, 'N': 132.118, 'T': 119.119, 'D': 133.103, 'I': 131.173,
'P': 115.131, 'V': 117.146, 'E': 147.129, 'K': 146.188, 'Q': 146.145,
'W': 204.225, 'F': 165.189, 'L': 131.173, 'R': 174.201, 'Y': 181.189
}
mwH2O = 18.015
aa2abs280 = {'Y': 1490, 'W': 5500, 'C': 125}
aa2chargePos = {'K': 10.5, 'R': 12.4, 'H': 6}
aa2chargeNeg = {'D': 3.86, 'E': 4.25, 'C': 8.33, 'Y': 10}
aaNterm = 9.69
aaCterm = 2.34
def __init__(self, protein):
"""
To make a ProteinParam object, a input string that represents an amino
acid sequence need to be passed in.
The input sequence is then formatted in all uppercase and valid single
letter amino acid codes in a string and then formatted with a dictionary
of values with the counts of every valid amino acid.
This design of a dictionary with the count of amino acids was decided
because it would be the easiest to have when calculating return values
for the methods, molarExtinction(), molecularWeight(), and charge().
"""
realAASeq = ""
listOfChar = list(protein.upper()) # Makes AA sequence input capitalized
realAA = self.aa2mw.keys() # Creates list of keys for valid amino acids
for aminoAcid in listOfChar: # Checks to see if all characters are valid amino acid codes
for realAminoAcid in realAA:
if aminoAcid == realAminoAcid:
realAASeq += aminoAcid
aaSeq = realAASeq
aminoAA = list(self.aa2mw.keys())
numOfAA = [0] * len(aminoAA) # Creates beginning counts of each valid amino acid
indivAAList = list(aaSeq)
for aminoAcid in indivAAList: # Counts how many of each valid amino acids there are
for singleAA in aminoAA:
if singleAA == aminoAcid:
numOfAA[aminoAA.index(singleAA)] += 1
self.countAA = dict(zip(aminoAA, numOfAA)) # Creates a dictionary with amino acid counts
# Single letter amino acid codes are keys and their counts in the input sequence are their values
def aa_Count(self):
"""
Returns how total valid single letter amino acid codes in the input amino acid sequence
Calculated by using a for loop through the dictionary of amino acid counts to return the
sum of all the counts
input: dictionary containing amino acid counts of input sequence
output: valid number of amino acids in input sequence
"""
count = 0
for countAA in self.countAA.values(): # Counts total amino acids by taking sum of counts in dictionary
count += countAA
return count
def pI(self):
"""
Returns the theoretical isoelectric point. The theoretical isoelectric point is the pH
where the input amino acid sequence is neutralized or when its net charge is closest to 0
The method pI is basically a for loop that loops through pHs 0-14 by every 0.01 pH and
uses the charge() to check the charge at that pH. If the charge is lower than the saved
charge, it saves the charge and that pH. This is done to decide which pH provides the
most neutral charge for the amino acid sequence and returns the pH or theoretical
isoelectric point.
input: dictionary containing amino acid counts of input sequence
output: the pH where the amino acid sequence is neutralized or has a net charge closest to 0
"""
bestCharge = 1000000 # Represents smallest charge through the pH range 0-14
bestpH = 0.00 # Represents pH with the most neutral chrage for the amino acid sequence
for pH in range(0, 1401, 1): # Loops through pH 0-14 every 0.01
realpH = pH / 100
thisCharge = abs(self.charge(realpH)) # Absolute value of charge at pH using charge()
if thisCharge < bestCharge: # Checks if the recent pH produces a smaller charge
bestCharge = thisCharge
bestpH = realpH
return bestpH
def aaComposition(self):
"""
Return a dictionary containing valid single letter amino acid codes and their counts. The method
is pretty simple because the dictionary is made in the constructor and aaComposition() simply
returns it.
input: dictionary containing amino acid counts of input sequence
output: dictionary containing amino acid counts of input sequence
"""
return self.countAA
def charge(self, pH):
"""
Returns the charge of the amino acid sequence given a pH.
In other words, charge returns the charge of an amino acid sequence given a passed in pH.
The method charge() is private because its sole use is only to be used in another method, pI().
charge() utilizes a for loop and a series of if statements to determine if amino acid
is negatively and positively charged in the amino acid dictionary of counts given the dictionaries
aa2chargePos and aa2chargeNeg. If it is, it uses an equation with the amino acid's pKa and current
pH that adds to the positive or negative charge to be used to calculate the net charge. It also
factors the N and C terminus into the charge() after using object attributes, aaNterm and aaCterm
and return the net charge.
Positvely Charged AAs: K,R,H (N terminus)
Negatively Charged AAs: D,E,C,Y (C terminus)
Class Attributes Used: aa2chargePos, aa2chargeNeg, aaNterm, aaCterm
The class attributes used were need to get the pKa's of the amino acids which were saved in
floats and in dictionaries in the class to find the net charge of the protein sequence.
input: dictionary containing amino acid counts of input sequence
output: charge of the amino acid at a specific pH
"""
posCharge = 0.00 # Charge contribution from positively charged amino acids and N terminus
negCharge = 0.00 # Charge contribution from negatively charged amino acids and C terminus
if self.aa_Count() != 0:
for aminoAcid, countAminoAcid in self.countAA.items(): # Loops through amino acids
if self.aa2chargePos.get(aminoAcid,
'unknown') != 'unknown': # Checks for positively charged amino acids
posCharge += countAminoAcid * (10 ** (self.aa2chargePos.get(aminoAcid))) / (
10 ** self.aa2chargePos.get(aminoAcid) + 10 ** pH)
elif self.aa2chargeNeg.get(aminoAcid,
'unknown') != 'unknown': # Checks for negatively charged amino acids
negCharge += countAminoAcid * (10 ** pH) / (10 ** self.aa2chargeNeg.get(aminoAcid) + 10 ** pH)
posCharge += (10 ** self.aaNterm) / (10 ** self.aaNterm + 10 ** pH) # Factors N terminus in net charge
negCharge += (10 ** pH) / (10 ** self.aaCterm + 10 ** pH) # Factors C terminus in net charge
return posCharge - negCharge
def molarExtinction(self):
"""
Returns the molar extinction coefficent. The molar extinction coefficient is how much light
a protein absorbs at a given wavelength.
A for loop is used to go through every item of the amino acid count dictionary to determine
if an amino acid is one that is part of the calculation of the molar extinction coefficient.
If the amino acid is either Y, W, C, then it is factored in calculating the molar extinction
coefficient by multiplying its count and extinct coefficients.
Class Attributes Used: aa2abs280
The class attribute used here was used to find the molar extinction coefficients of particular
amino acids that contributed to the molar extinction coefficient of the total protein sequence.
input: dictionary containing amino acid counts of input sequence
output: molar extinction coefficient of a protein
"""
molEx = 0.000 # Molar Extinction Coefficient of AA Sequence
for aminoAcidAA, countAminoAcidAA in self.countAA.items(): # Loops through AA count dictionary
if self.aa2abs280.get(aminoAcidAA, 'unknown') != 'unknown': # Checks if amino acid affects light absorption
molEx += countAminoAcidAA * self.aa2abs280.get(aminoAcidAA)
return molEx
def massExtinction(self):
"""
Returns mass extinction coefficient. Divides molar extinction coefficient by molecular weight
to calculate mass extinction coefficient
massExtinction() utilizes molecularWeight() and molarExtinction() methods to return mass
extinction coefficient.
input: dictionary containing amino acid counts of input sequence
output: mass extinction coefficient of a protein
"""
myMW = self.molecularWeight()
return self.molarExtinction() / myMW if myMW else 0.0 # Ensures molecular weight is greater than 0
def molecularWeight(self):
"""
Returns the molecular weight of the amino acid sequence by adding weight of amino acid
sequence.
molecularweight() goes through the amino acid count dictionary and counts how many of each
amino acid there is and uses it to multiply it by its molecular weight in the aa2mw dictionary.
It simply then returns sum of these amino acids in the sequence.
Class Attributes Used: aa2mw
This attribute was used to determine the molecular weight of each amino acid saved as
a dictionary.
input: dictionary containing amino acid counts of input sequence
output: molecular weight of the amino acid sequence
"""
weight = 0.000
if self.aa_Count() != 0:
for aminoAA, countAA in self.countAA.items(): # Loops through each dictionary item of countAA
weight += self.aa2mw.get(aminoAA) * countAA
weight -= self.mwH2O * countAA # Get rid of H20 weight in the amino acid
weight += self.mwH2O
return weight
class NucParams:
"""
The class, NucParams, has one constructor and five methods that use to count amino acid,
single nucleotides, and codons of an DNA/RNA nucleotide sequence.
The class has 3 class attributes that are dictionaries that relate counts of nucleotides
and codons in the input string and the relationships between codons and amino acid letter
codes. These dictionaries are used in all the methods to return counts and the dictionaries
themselves to be used to gain information about a DNA/RNA nucleotide sequence.
NucParams takes an input string and runs for loops that stores single nucleotides and codons.
This was done because given the methods, it would be much easier to create dictionaries of
the input sequence in the constructor using addSequence() and return them in methods. Furthermore,
many of the methods rely a lot on each other so it seemed best to make them attributes of the class.
Three of the methods in the class return dictionaries of valid nucleotides, codons, and amino acids
in the input sequnece. One method returns the total count of nucleotides and the last method allows
to add on the previous sequence with new sequence that can be analyzed together.
"""
rnaCodonTable = {
# U
'UUU': 'F', 'UCU': 'S', 'UAU': 'Y', 'UGU': 'C', # UxU
'UUC': 'F', 'UCC': 'S', 'UAC': 'Y', 'UGC': 'C', # UxC
'UUA': 'L', 'UCA': 'S', 'UAA': '-', 'UGA': '-', # UxA
'UUG': 'L', 'UCG': 'S', 'UAG': '-', 'UGG': 'W', # UxG
# C
'CUU': 'L', 'CCU': 'P', 'CAU': 'H', 'CGU': 'R', # CxU
'CUC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R', # CxC
'CUA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R', # CxA
'CUG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R', # CxG
# A
'AUU': 'I', 'ACU': 'T', 'AAU': 'N', 'AGU': 'S', # AxU
'AUC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S', # AxC
'AUA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R', # AxA
'AUG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R', # AxG
# G
'GUU': 'V', 'GCU': 'A', 'GAU': 'D', 'GGU': 'G', # GxU
'GUC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G', # GxC
'GUA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G', # GxA
'GUG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G' # GxG
}
dnaCodonTable = {key.replace('U', 'T'): value for key, value in rnaCodonTable.items()}
def __init__(self, inString=''):
"""
The constructor takes in an optional string parameter that is suppose to represent the DNA/RNA
nucleotide sequence. If there is no sequence, it gives a default empty string.
The constructor saves two additional dictionaries to the class that count single nucleotides
and codons of the input string.
To get these counts, the constructor uses addSequence() to first capitalize all letters and
gets rid of spaces before adding to dictionaries. Then it uses for loops to go through the
input string by character to determine if it is a valid single nucleotide and then by a string
of three characters to determine of it a valid codon and saves these in dictionaries.
The design of dictionaries was used because it seemed that since the constructor and addSequence()
do practically the same thing, sort nucleotides and codons into dictionaries and many of these
methods involve returning these dictionaries, it was decided that it was better to have class
attributes as dictionaries.
"""
self.validNucDic = {'A': 0, 'C': 0, 'T': 0, 'G': 0, 'U': 0, 'N': 0} # Valid nucleotide count dictionary
self.rnaCodonCount = {
# Valid codon count dictionary
# U
'UUU': 0, 'UCU': 0, 'UAU': 0, 'UGU': 0, # UxU
'UUC': 0, 'UCC': 0, 'UAC': 0, 'UGC': 0, # UxC
'UUA': 0, 'UCA': 0, 'UAA': 0, 'UGA': 0, # UxA
'UUG': 0, 'UCG': 0, 'UAG': 0, 'UGG': 0, # UxG
# C
'CUU': 0, 'CCU': 0, 'CAU': 0, 'CGU': 0, # CxU
'CUC': 0, 'CCC': 0, 'CAC': 0, 'CGC': 0, # CxC
'CUA': 0, 'CCA': 0, 'CAA': 0, 'CGA': 0, # CxA
'CUG': 0, 'CCG': 0, 'CAG': 0, 'CGG': 0, # CxG
# A
'AUU': 0, 'ACU': 0, 'AAU': 0, 'AGU': 0, # AxU
'AUC': 0, 'ACC': 0, 'AAC': 0, 'AGC': 0, # AxC
'AUA': 0, 'ACA': 0, 'AAA': 0, 'AGA': 0, # AxA
'AUG': 0, 'ACG': 0, 'AAG': 0, 'AGG': 0, # AxG
# G
'GUU': 0, 'GCU': 0, 'GAU': 0, 'GGU': 0, # GxU
'GUC': 0, 'GCC': 0, 'GAC': 0, 'GGC': 0, # GxC
'GUA': 0, 'GCA': 0, 'GAA': 0, 'GGA': 0, # GxA
'GUG': 0, 'GCG': 0, 'GAG': 0, 'GGG': 0 # GxG
}
self.addSequence(inString) # Calls addSequence() to add to the dictionary counts
def addSequence(self, inSeq):
"""
Adds a new DNA/RNA nucleotide sequence to the previous DNA/RNA nucleotide sequence
to be counted.
An input string is taken as parameter which represents the sequence added to the previous
sequence.
The method is used to format the string and then use it to add to the counts of valid
nucleotides and codons. addSequence() capaitalizes and gets rid of white spaces to format
the DNA/RNA sequence. It then loops through the string by charcater and then by three letters
to check for valid nucleotides and codons and adds them to the dictionary.
Class Attributes Used: validNucDic, rnaCodonCount
input: an input string that represents a DNA/RNA sequence
output: no output
"""
addedDNACap = inSeq.upper() # Capitalizes DNA/RNA sequence
addedDNANoSpace = addedDNACap.replace(' ', '') # Gets rid of spaces
for singNuc in addedDNANoSpace:
if singNuc in self.validNucDic:
self.validNucDic[singNuc] += 1 # Adds to count
rnaSeqConv = addedDNANoSpace.replace('T', 'U')
for i in range(0, len(rnaSeqConv), 3):
if rnaSeqConv[i:i + 3] in self.rnaCodonCount:
self.rnaCodonCount[rnaSeqConv[i:i + 3]] += 1 # adds to count
def aaComposition(self):
"""
Determines the amino acid counts of the 20 amino acids and stop codon and
returns the counts in a dictionary.
Creates a dictionary called aa_Count which is an dictionary with all amino acids
that begin with a count 0. The codon count dictionary is then looped and checks
to see which codon belongs to which amino acid using rnaCodonTable and will add
to a item in aa_Count if it determines a codon belongs to a particular amino acid.
Class Attributed Used: rnaCodonCount, rnaCodonTable
input: A RNA codon count dictionary that counts codons in a DNA/RNA sequence
output: amino acid count dictionary in a RNA/DNA sequence
"""
aa_Count = {
'F': 0, 'L': 0, 'I': 0, 'M': 0, 'V': 0,
'S': 0, 'P': 0, 'T': 0, 'A': 0, 'Y': 0,
'H': 0, 'Q': 0, 'N': 0, 'K': 0, 'D': 0,
'E': 0, 'C': 0, 'W': 0, 'R': 0, 'G': 0, '-': 0
}
for codonCode in self.rnaCodonCount.keys():
if self.rnaCodonCount.get(codonCode) != 0: # Checks if count is not 0
aa_Count[self.rnaCodonTable.get(codonCode)] += self.rnaCodonCount.get(codonCode)
return aa_Count
def nucComposition(self):
"""
Returns a dictionary containing valid single nucleotide counts in the DNA/RNA
sequence.
The method is very simple which it simply returns the class attribute of
validNucDic
Class Attributes Used: validNucDic
input: A dictionary containing single nucleotide counts in input sequence
output: A dictionary containing single nucleotide counts in input sequence
"""
return self.validNucDic
def codonComposition(self):
"""
Returns a dictionary containing valid rna codon counts in the DNA/RNA
sequence.
The method is very simple which it simply returns the class attribute of
rnaCodonCount
Class Attributes Used: rnaCodonCount
input: A dictionary containing valid codon counts in input sequence
output: A dictionary containing valid codon counts in input sequence
"""
return self.rnaCodonCount
def nucCount(self):
"""
Returns a count of total valid nucleotides in the DNA/RNA sequence.
The method is very simple which it simply returns an integer of total
nucleotides in the sequence.
Class Attributes Used: validNucDic
input: A dictionary containing valid single nucleotide counts in input sequence.
output: An integer of how many total valid nucleotides there are in the sequence.
"""
count = 0
for nucC in self.validNucDic.values(): # Loops through nucleotide counts in validNucDict
count += nucC # adds counts to total count
return count
import sys
class FastAreader:
"""
Define objects to read FastA files.
instantiation:
thisReader = FastAreader ('testTiny.fa')
usage:
for head, seq in thisReader.readFasta():
print (head,seq)
"""
def __init__(self, fname=''):
"""contructor: saves attribute fname """
self.fname = fname
def doOpen(self):
""" Handle file opens, allowing STDIN."""
if self.fname == '':
return sys.stdin
else:
return open(self.fname)
def readFasta(self):
""" Read an entire FastA record and return the sequence header/sequence"""
header = ''
sequence = ''
with self.doOpen() as fileH:
header = ''
sequence = ''
# skip to first fasta header
line = fileH.readline()
while not line.startswith('>'):
line = fileH.readline()
header = line[1:].rstrip()
for line in fileH:
if line.startswith('>'):
yield header, sequence
header = line[1:].rstrip()
sequence = ''
else:
sequence += ''.join(line.rstrip().split()).upper()
yield header, sequence
class OrfFinder():
"""
Create a list of open reading frames that contain its frame, positions, and length
The OrfFinder is set up to add open reading frames to openReadingFrames, a class
attribute that is a list of all the ORFs found in a DNA sequence. openReadingFrames
is needed to store the ORFs found in the sequence and is used to create the output
file.
The class contains a constructor and three methods that are used to generate ORFs
and to add them to openReadingFrames. A majority of the program is done in findOrfs()
because it felt it would be simplier to find and keep positons, frames, and length of ORFs
done in one method which could be easily added to openReadingFrames.
"""
def __init__(self, inString="", minGene=0, largeOrf=False, startCodons={'ATG'}, stopCodons={'TAA', 'TAG', 'TGA'}):
"""
Creates and initializes an OrfFinder by finding ORFs and adding them to openReadingFrames
openReadingFrames contained all the information needed for the formatted output file so
that was the only class attribute made and is accessible throughout the class.
The constructor uses the methods, reverseComplimentaryStrand() and findOrfs() to add ORFs
to the openReadingFrames list that will be used to generate a formatted output file.
Keyword Arguments:
self -- OrfFinder object
inString -- A string of a DNA sequence (default - empty string)
minGene -- Minimum length of a gene (default - 100)
largeOrf -- Decides if the largest ORF or all ORFs are added (default -- False, Only largest ORF reported)
startCodons -- Start codons used to find starts of ORFs (default - {'ATG'})
stopCodons -- Stop codons used to find stops of ORFS (default - {'TAA', 'TAG', 'TGA'})
"""
# Creates a complimentary strand of DNA
complimentStrand = self.reverseComplimentaryStrand(inString)
self.openReadingFrames = []
# Finds and adds ORFs for DNA sequence
self.findOrfs(inString, 1, minGene, startCodons, stopCodons, largeOrf)
# Finds and adds ORFs for reverse DNA sequence
self.findOrfs(complimentStrand, -1, minGene, startCodons, stopCodons, largeOrf)
def reverse_comp_strand(self, fwd_strand):
""" Creates a reversed complimentary strand of a DNA sequence. """
complimentStrand = fwd_strand.lower().replace('a', 'T').replace('t', 'A').replace('c', 'G').replace('g',
'C')
return complimentStrand[::-1] # Reverses and returns the new complimentary strand of DNA
def findOrfs(self, strand, typeStrand, minLength, startCodons, stopCodons, largeOrf):
"""
Generates ORFs that are added to openReadingFrames
findOrfs() mainly decides what type of ORFs will be added to the list of ORFS.
It uses a few for loops amd many parameters to determine start and stop position
of an ORF based on the user's specifications.
input: A string of a DNA sequence that will be searched for ORFs
output: None
The method creates a list of values that represent an ORF. It consists of
Frame -- the frame the ORF was found
Start Position -- The position the ORF begins on (including the Start Codon(s))
Stop Position -- The position the ORF ends on (including the Stop Codon(s))
Length -- The length of the ORF
Keyword Arguments:
Most of the keyword arguments of these methods were passed in the constructor.
These arguments are the same as the same ones passed in.
typeStrand -- an integer that determines if it a DNA sequence is forward DNA
or the reversed DNA sequence.
"""
# Loops through frames of a DNA sequence
for frame in range(1, 4, 1):
startPos = [] # List of start positions on the DNA sequence
noStopYet = True # Determines if stop codon has been reached
# Used for boundary stop codon cases that occur in the beginning
# Loops through codons
for position in range(frame - 1, len(strand), 3):
# Checks for start codons and if they can be added based on largeOrf argument
if strand[position:position + 3] in startCodons and (largeOrf == False or len(startPos) == 0):
startPos.append(position + 1) # Adds start positions to startPos[]
# Checks for stop codons
elif strand[position:position + 3] in stopCodons:
endPos = position + 3 # Finds end position
# Checks for boundary stop case ORFs
if noStopYet == True and len(startPos) == 0 and minLength < endPos:
if typeStrand > 0: # Checks for positive frames
self.openReadingFrames.append([frame * typeStrand, 1, endPos, endPos])
elif typeStrand < 0: # Checks for negative frames
self.openReadingFrames.append(
[frame * typeStrand, len(strand) - (endPos - 1), len(strand), endPos])
# Checks if start codons/positions upstream after previous stop
elif len(startPos) != 0:
for pos in startPos:
# Ensures ORFs are larger than minLength
if minLength < (endPos - pos + 1):
if typeStrand > 0: # Checks for positive frames
self.openReadingFrames.append([frame * typeStrand, pos, endPos, endPos - pos + 1])
else:
self.openReadingFrames.append(
[frame * typeStrand, len(strand) - (endPos - 1), len(strand) - (pos - 1),
endPos - pos + 1])
noStopYet = False # Does not allow for more boundary stop cases
startPos = [] # Gets rid of start positions after a stop is reached
# Checks for boundary start cases
if len(startPos) != 0:
for pos in startPos:
# Ensures boundary start ORF cases are larger than minLength
if minLength < (len(strand) - frame + 2 - pos):
if typeStrand > 0: # Checks for positive frames
self.openReadingFrames.append([frame * typeStrand, pos, len(strand), len(strand) - pos + 1])
else:
self.openReadingFrames.append(
[frame * typeStrand, 1, len(strand) - (pos - 1), len(strand) - pos + 1])
# Check if no genes were found
elif noStopYet == True and minLength < len(strand):
# Adds whole DNA sequence as gene if no genes found
self.openReadingFrames.append([frame * typeStrand, 1, len(strand), len(strand)])
def getOrfs(self):
"""
Returns ordered list of ORFs found on the DNA sequence
input: None
output: A list of ORFs that contain their start and stop positions, their frame, and their length
The method sorts the ORFs by
1. length: descending order
2. frame: ascending order
3. start position: ascending order
"""
self.openReadingFrames.sort(key=lambda entry: (entry[3], -entry[0], -entry[1]), reverse=True)
return self.openReadingFrames