forked from mtarbit/Rosalind-Problems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathe004-gc.py
executable file
·98 lines (78 loc) · 2.51 KB
/
e004-gc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# GC Content
# ==========
#
# DNA strings must be labeled when they are consolidated into a database. A
# commonly used method of string labeling is called FASTA format. In this
# format, the string is introduced by a line that begins with ">", followed by
# some information naming and characterizing the string. Subsequent lines
# contain the string itself; the next line starting with ">" indicates the label
# of the next string.
#
# In Rosalind's implementation, a string in FASTA format will be labeled by the
# ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and
# 9999.
#
# Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
#
# Return: The ID of the string having the highest GC-content, followed by the
# GC-content of that string. The GC-content should have an accuracy of 4 decimal
# places (see the note below on decimal accuracy).
#
# Sample Dataset
# --------------
# >Rosalind_6404
# CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
# TCCCACTAATAATTCTGAGG
# >Rosalind_5959
# CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
# ATATCCATTTGTCAGCAGACACGC
# >Rosalind_0808
# CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
# TGGGAACCTGCGGGCAGTAGGTGGAAT
#
# Sample Output
# -------------
# Rosalind_0808
# 60.919540%
def parse_fasta(s):
    """Parse FASTA-formatted text into a dict mapping label -> sequence.

    Each record starts with a ">" header line. The label is the first
    whitespace-delimited token of that header; any further text on the
    header line is a description and is ignored (it must NOT leak into the
    sequence). Every following line up to the next ">" is stripped of
    whitespace and concatenated to form the sequence.

    Args:
        s: Text containing zero or more FASTA records.

    Returns:
        A dict of {label: sequence}. Records that contain only whitespace
        are skipped. A record whose header line is empty is stored under
        the label ''. If a label occurs more than once, the last record
        with that label wins.
    """
    results = {}
    for record in s.split('>'):
        # Splitting on ">" yields an empty chunk before the first header
        # (and for any stray ">"); those carry no data.
        if not record.strip():
            continue
        # splitlines() also copes with "\r\n" line endings.
        lines = record.splitlines()
        header_tokens = lines[0].split()
        label = header_tokens[0] if header_tokens else ''
        # Only lines AFTER the header belong to the sequence; drop any
        # whitespace inside them so wrapped lines join seamlessly.
        bases = ''.join(''.join(line.split()) for line in lines[1:])
        results[label] = bases
    return results
def gc_content(s):
    """Return the GC-content of a DNA string as a percentage.

    The GC-content is the share of symbols in the string that are 'G' or
    'C'. Matching is case-insensitive, so lowercase bases are counted as
    well.

    Args:
        s: The DNA string.

    Returns:
        A float in the range 0.0-100.0. An empty string yields 0.0 rather
        than raising ZeroDivisionError.
    """
    n = len(s)
    if n == 0:
        # No symbols at all: define the GC-content as zero instead of
        # dividing by zero.
        return 0.0
    upper = s.upper()
    m = upper.count('G') + upper.count('C')
    # float() keeps true division under Python 2 as well.
    return 100 * (float(m) / n)
if __name__ == "__main__":
    # Sample dataset from the problem statement. It is not used by the run
    # below; pass it to parse_fasta() instead of large_dataset for a quick
    # manual check.
    small_dataset = """
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
"""

    # Use a context manager so the file handle is closed deterministically.
    with open('datasets/rosalind_gc.txt') as dataset_file:
        large_dataset = dataset_file.read()

    # Map every record label to the GC-content (in percent) of its sequence.
    # .items() and the single-argument print(...) form below behave the same
    # on Python 2 and Python 3.
    sequences = parse_fasta(large_dataset)
    results = {k: gc_content(v) for k, v in sequences.items()}

    if results:
        # Pick the label with the highest GC-content. Unlike comparing
        # against an initial 0 with ">", this still names a record when
        # every record has 0% GC.
        highest_k = max(results, key=results.get)
        highest_v = results[highest_k]
    else:
        # No records at all: nothing to report.
        highest_k = None
        highest_v = 0

    print(highest_k)
    print('%f%%' % highest_v)