-
Notifications
You must be signed in to change notification settings - Fork 0
/
RandomizedMotifSearch.py
129 lines (107 loc) · 5.45 KB
/
RandomizedMotifSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import random
import sys
def indexToAcid(i):
return "ACGT"[i]
def acidToIndex(a):
return "ACGT".index(a)
def repeatedRandomizedMotifSearch( k, t, dna ):
bestScore = float('inf')
bestMotifs = []
i = 0
while True:
motifs = randomizedMotifSearch(k,t,dna)
score = Score(motifs)
if score < bestScore:
bestScore = score
bestMotifs = motifs
i = 0
else:
i += 1
if i > 100:
break
return bestMotifs
def randomizedMotifSearch( k, t, dna ):
bestMotifs = randomMotifs(dna,k)
bestScore = Score(bestMotifs)
while True:
profile = profileFromMotifs(bestMotifs,1)
motifs = motifsFromProfile(dna,profile)
score = Score(motifs)
if score < bestScore:
bestMotifs = motifs
bestScore = score
else:
return bestMotifs
def Score( motifs ):
score = 0
for count in countsFromMotifs(motifs,0):
score += sum(count) - max(count)
return score
def motifsFromProfile( dna, profile ):
return [bestKmerForProfile(seq, profile) for seq in dna]
def bestKmerForProfile( seq, profile ):
k = len(profile)
bestProb = -1
bestKmer = ''
for start in range(len(seq)-k+1):
kmer = seq[start:start+k]
prob = probKmerInProfile(kmer, profile)
if prob > bestProb:
bestProb = prob
bestKmer = kmer
return bestKmer
def probKmerInProfile( kmer, profile ):
prob = 1.0
for i in range(len(kmer)):
prob *= profile[i][acidToIndex(kmer[i])]
return prob
def profileFromMotifs( motifs, initCount ):
counts = countsFromMotifs(motifs,initCount)
profile = []
for count in counts:
total = float(sum(count))
probs = [c/total for c in count]
profile.append(probs)
return profile
def countsFromMotifs( motifs, initCount ):
k = len(motifs[0])
for motif in motifs:
assert k == len(motif)
counts = []
for i in range(k):
currCount = [initCount] * 4
for motif in motifs:
currCount[acidToIndex(motif[i])] += 1
counts.append(currCount)
return counts
def randomMotifs( dna, k ):
return [randomKmer(seq,k) for seq in dna]
def randomKmer( seq, k ):
start = random.randint(0, len(seq)-k)
return seq[start:start+k]
# first, import the random package
# Next, copy your RandomizedMotifSearch function (along with all required subroutines)
# from Motifs.py below this line
# Copy the ten strings occurring in the hyperlinked DosR dataset below.
dna = ["GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC"]
# set t equal to the number of strings in Dna, k equal to 15, and N equal to 100.
t=10
k=15
N=100
# Call RandomizedMotifSearch(Dna, k, t) N times, storing the best-scoring set of motifs
# resulting from this algorithm in a variable called BestMotifs
R = 1000
BestMotifs= repeatedRandomizedMotifSearch(k,t,dna)
# Print the BestMotifs variable
print(BestMotifs)
# Print Score(BestMotifs)
print (Score(BestMotifs))