-
Notifications
You must be signed in to change notification settings - Fork 0
/
GibbsSampler.py
116 lines (102 loc) · 8.9 KB
/
GibbsSampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#GibbsSampler.py
import random
dna = ['TGTCTCGACAGGGGTCCTACAAGGGTTCCACCGACAGGCATACTCCTGTGTATATCATAGGGTAAGGAACCACTCTCACCGGCCACCCTGTACGACTATAGTTTACGCATGATGCATGGGACTGCATTAGGATGCGGCATGGTACCGGCGTATATGAATCCGCAATGCTCCCGCTAGGTACCGAGAAAGCACGTCATATCTATGGCTATGGGGAGGCGAAGACTGGTCCCTGCGGCGTCCACTCAAGCAAATGTCGTCTGGCCTCGCATTGATCCGGTCGGTGGGATCCTGACAGTTTTGATTCAACACTACACCATGTCTCGACAGGGGT',
'CCTACAAGGGTTCCACCGACAGGCATACTCCTGTGTATATCATAGGGTAAGGAACCACTCTCACCGGCCACCCTGTACGACTATAGTTTACGCATGATGCATGGGACTGCATTAGGATGCGGCATGGTACCGGCGTATATGAATCCGCAATGCTCCCGCTAGGTACCGAGAAAGCACGTCATATCTATGGCTATGGGGAGGCGAAGACTGGTCCCTGCGGCGTCCACTCAAGCAAATGTTTACTTGATTTCATTCGTCTGGCCTCGCATTGATCCGGTCGGTGGGATCCTGACAGTTTTGATTCAACACTACACCATGTCTCGACAGGGGT',
'GCATTATGTGACTGCCTTGAAGTTGGGGACATTTGGCGTGTATCGAGATACCCAAACATTTGAGACGTACGCTCGCACCTCTTGACTGCGCAAGCGGCGTTGATCGCCGTCAATGTGATCTACTGTACGCATCCTATTATCGAAATGTCGTTCTACCGCTTTTACAGTTTACCGCAGATTTCATTTCGATAGCTAAAGTCCGTCAATAGTCTTCAGCAACGCGCCGGCTTAAGATCACTCTGCAAGCAGAACCAAACGCCTCAACCATAGTAGATTGGCATCGTACGACATAACACCCATAGGGGCCTCAGGCACCTTTAGTTCGCCTTAC',
'GCCCGCTCATGTGGCGGTGACAATGAGCAAGGGGGTGTCAATAGGATGGCGTTAGTCCGATGTAGGTGGATCCAATCGACGCTGGGGCGCGCCATAGCTGCCGCTTGGGAACAATGGAGGTGCTGGCTGTTCTTTTTTGCCATGCCTCAACTAAGTGGGTCACGCGAATATAGTATGCAACTCACCTTTCATTTAAAGTACGTGGATGGTGCCTCTCTACGGACACATTTAGCACTGTTCATTGCGCAAAAGGGACCCTTAAGTACCCGTGATTCGGTTATGTGCAGCTCTAAGAGTTGTAGCCGCGAGGTGCATAGTGTCTGGAATAGGG',
'GTTTTAGGGTAAGATGTCTTTCTTTAGCAGAAGGCATTGGCTGCACTCACGTGTTGAGAAATACTGCTGAATGAAATCCCGAGAGCTGTCGCGTGAGTCAATGGTCTCCAGGAATTGTGGGGTGATTATAGTTAGATCCCATACCGTTCTGGGAGAATAAGGGGGAGATTAGCTCGACGGGCCAAGGTACCAGTTCACACCCATCAGGCCTACCCAATGTCAACACCAAACACTCGGTTCAGACTGGTCTACTATGAGGAACGTAATCACAACTTATCAGTCGGGGAATGTCACTGTCACCTTTTCAGGTATAGCGCGGACCGCTGAGGAG',
'CATAGTGCTGACGAGTGATTCGAGCTGCCACCTTAAGAGCCATCGTACAGACTAACACTCATCAGCGCCGATGAACTTACGAGGTTGGAGACAACATGCCACCTTCAATTTAAGTACACAAATCAGTATTACCTGTCGCTGGGATTTCATAGTGGTATGTCCTTGTGCGCCGACATCTCCGTAGCATGGCGCACTCGGGAGCATATCGAAGGATTGCTCTTAACGCTGTCTCCACTGCGTGAAATTTTCATCGTGGTATCGGCTAACCTTCGTTGCAGACCCGAATGTTGAGAAGTACCGCATCGAGTTAGCAGATCGAATTACGGTGTTA',
'AAGCGGGACGGCTACGCCGATGGCACCGGAACGAAGCATGCCGGTATGGACACAAATGACGAATAGGTACCCATTCCATCAATATGTTGTCGATAGTTATTGTAAGCTGAGTGGATGGCCGTCAGTGTACGGCGCCACATACCGTAGGCGCAAGGTATTAATCGAATATGCGGATTCAGGGCCTCCGAAAGTTGACGGATTAATTTGAGAGATTTCATTGTTGTCGCGTTACACAGTGGAGACGTTTATACCCCACCTCTAAGACAGGGTCAATTCCATTTCCTTATCTTTCCCACCACTGACTTTCATTTACCGTCTATTGGCGGATTGG',
'TTAGCACAGTAATTGATCGGGAAAGGCCTAACGACTATGCCGACCCCCGCCAATGCTCTGGTGTCTGAATAACAGGGGTACAGTCCGTGTCAAGGACGCCTGCCCGCTAACCCGAGCTCGTTTCATCGGCGGCCCTGAGCAGAATCATCATTTAACTAGAATTCTCCGGACTAATCTGACTCAGCTCACCAAGCAGATTTCATAACACAGGGCCATACTATCGTGATCGCGTTGTTACGCAGCAAATACAGGACGAAGAGCGCGCCAACCAATGGCAGGGAGTCGGCCGGAGCCTTGTCATCTAATTTATCCCGTAGGAACAGGGGCGACG',
'TAAATCGACTACGCTCATTAGCACGGTTCATTAGTGTTGATTCCGAGGATAATTACTAGTTGTGTATCCCAAGATTCACGCGAGTCAAATCTCTCTGGATCTCATGGCCAGGTAGCAAGGGATGCTTTTTAAGTTGTTCGCTTCGTATTCACATCCTGACAAAATGAGACAGAATTATGGGTCGCGCCTGGAATATTGCAACCGCATCATGAATCGAAATGTTTCTGACCTGGAGCTGGGCTGAGAGGAGTGGGCGAGGTCTCACAGACGAACTAAGGTCATCAAACACACTACTCGGAGGTCTAAGTCCCGGCACCGACACTTCGTCTGC',
'GACTTGCTACCGGGATAGACCTGGATATTCCCCACGAAAGTTACAGAGACCACCGGTGGAGGGCGCTAGGTGGACGGGCAACTGACCTTTAGCAGATTTTGATTGTTGTCAATCGTGCGCATCTCAGGAGTCAACCTTAGTTGAAAGTGAGGCCTGACCAAGATGCATGGCTGGATAGGGGCGATCCAAGCTATGGGATTAAGTGTAAGATCCCCCGCATAACGGCGTGAGACACTCTTCGAATAATCAAGAGGTCTCTCAATATTCATAACTTCAGCAGCGAATAGACGTAAGCCCCGGTGCACGAAACAACAGGTGTACGTGGTATTCA',
'TGCCTAAATACGTTCTGGGGTCGTGCTCTTGGGGCTTTCTGGTGTGTACTCGGTTTACTCAACCAATTTGACTCCATTAAGAAGTATTAGCGCCTTTCATTGCTACATGGTACGCGTCGATACGCGCGGAAGGGACGCCTCACTTTTGACCTGATCCGGCCCGCCTTCGCTCACTACCAATACAATGAATCTGTGACTCATCACTTCTCACGTTTGGCATACTCTACGTATCGAGCTTCTAGTACGTCCCAACAGAGATGCCAGTTAGCTCAGCCAGACGCGAGACTTAACAGGAACTCCTAATCCGGCCAAGGGCCGAGGGGATTAAGAA',
'CGGCGGAGTCTACGTTTACCATCTGGTCTTTCAACTAGTTGATCGAAAGTTAGTAACCTTTGCTGATCATCATAATTGTCGTCTCCCCTATTACATTCCACTCTGCACACGGCGACATTCCAACAGTGGGGACCCACGTTCCGTCGTCCGCGTAAAGCACTATTCTCACCATCGTATACACAGATTTCATTATTAATTAGCCATCCTTCACAAGTTTCCCGATCATACGTGTCCCTTATCTAGACACCACGTCAGGTTTGAGGGTGGTACAGGAGCGCTAGTGAGTTTTGGGCGCATCCTACCACGACAAGGTGAGAATGTCAGTCTTGTC',
'GAGTATAGATTATAAATGTTCGGATTAGCAGATTGACTTTATCCTGAAAAGCAAAATCGGTGGCGCGGTAGATGGGTGCTAACTAAGCACTGCGTGCCGTAGCGTTTGCTGATAGACTCGATTAGGACAGAGTCATTCGTGGGAATAGCCTATTTGGTAATGACCTAACGCGTGAGACACGTACATATAAGTGGGTACCTTGAGAGGGTGACCAGTACAGCTTTCAGCCAGCCCGATTGCGTCATCGGGCGAAAAATGGCCGGCCATATTTTCTCTTTTATGTGTCACAACTTTCATATTCCTCCAAGACGTGCCAGTTTCTGGTGGGCGT',
'GAAGTTTCGACAACGTAGGAAGTGAGCGCGTAGCGATAAGAAGCGACGCCGACTCAAGACAGCCCCGGTATAGCTAGGACAGTAACTGCTCCGGTTGCAGCCTTTTTCGTCTCCTTATTCCTCGATCTGCCTGCTCGCTTGAGTCTTACAGGTATATCTCATAGCGGGCAGTGGCCGGGAGAAGACGCGGTTTCCTCTAGCGTAACCGCATAAATTCAACGCACCGTCTTGGTAATGACCAGTGCAAGCTGGTATTCCTTAGCAGGCATCATTCGCCCTGGGGGTAGCACGCTGCGTCAGCTACTAGGATCTACCTAGTATGTGGATTTCT',
'CAAACTTTGAAGATCCGCTGGCACCCCTTAGGGCCTGAATAACCTTAGCACACTATCACTCTTAGTTAATCCGGCGAACCTGGCGTGGCACTTATACCTTCTTCACCCGTTTCCCCATGTAAGGCCACAGTGCTGGCGACTCGCCGATCAGAGCTAGTAGCGGCTAACATTTGGGAATAATCGGTAAATCTATCTTTAGCAGATTTCCCCTCGGCCACATGGTATCAGGGTAGATCTCCCGTGTCAAAGGATGGTTGCGCGACCAATGGCCGAGCGGTGAAGGACAATGTAAGTAGCTAGGTTGTAATAGCCCGAAGGTCCGTGGATGAAC',
'GGGAACTCGAGGATCGGACTATTCCTGGCCGAAATCAGGGCCACTAACTAAAAAACTGCGCTCAGTACGAACCTAATTTTCGAATCATCCATAGACGCGGGTCGGGACAGAGGGGGATGATAGTCTACTCTAGCAGATTTCAGGGTTCGGATGTACAGTAGGAATTTAACGCGAGATGAAATCGTCCTAGTCATCATAACTAAGTTAAGTTCCGGGTCTTTCGACTCAGTCATGCAATCTCTATCGGTAGCCGGTATCGCGTCACGGGTATCCCTGTTCAAAACTCCGCAATGCGTGTATCTTTGAGATCCGCGGACGTGTTAGTCCGCAC',
'TGTTGACCCATGCAGCAATCTGCTAGACGAGGTCCAAAAGTACTATCGAACCTATGCCTCGCCGCCTGTAACCCAGTTAATGTACGGGTTCGCAAGGAAAGCCTACCCTTGAAAGTGGTGCCCGACGCTTCCCAATGACGGAATCACATACTCGTGACCTGGGCACAGCAGTACTTGATCGGAATCAATTATTCGATTTCATTTTTTCTCAGCGGTCCAAACATGGTATTAAACTCGACACCGAAAGCCGGAAAGGCCCATAGACTGTTCACTGTAGTGTAGGTCATGGCCGTATCTGGATAACCGCTGACTTCCTTAGAAGGCAAGAGTG',
'CGTCCCTGGTCCAGCGCGGGAATGAGTGGGCACCTCACGTTGCTCAACAGGTCGTAAGTAGAAGGGAGTGCAGCAGCGATGAAGGCGTAGCAAGACTTCTAGAAGTTTGTCGTAGATTAGACTATTTCATTCGCGTACAACCAGACAACGGCTTGGTCGTCCGCGCACGCGCTCCCCATCACGTTGCTTCAGTGCAAGAACTTAGCTGAAATCTACCGGGTGTGTGGGTAATATTCCTTCCCACTGCCAGTCCAATCTTGAAACGACCGCAACTGCCATTCCTGTGCTAGTCGTGAGCCCAAATCAGCCCCTCGATCTAGCGGTTGGTGTA',
'GACTGATGATCTGCTCCTGACAGCACCGCAAAGGGTACTATGTCCAATGTCTATTTAATAATCGGTCGGTCAAATGAGTTCTTCGCATCCTTTGGGAGTAGATCCATGATTCGCTCACGGGGACCTGTTGGGACCTCGCCTTTTGCATTAAACTCGCCATTACGGAGTGCCTTCGTAGACGCTCACCAGAGAGCTGACAGATGAAGTCCGGAGGGTCATAGTTGCACAACATACCGCATTTATTAGTGAATTTCATTCACCCCAATGTCAATGTGTTAATGGTTCACGTCAAATCAGGTTCTACATATGGGACCCTTCTGCGTCTCATTGC',
'ATGAGAATGTTCTCGTCGCGTTCCAAAAGTTTAAATCGGGCGTAGGGCACCCTTTCCGCTGATTGGGGTGACCACTAGGGGATCTACCAAGCTGCGGAGAGGTTTTTACAGGCAATTAACCAGACAGCATCTACGCGACGGTAAGGGTAAGAGAAGACTTCTCAATCGAGGGATAGATTACCATTAGCCCCTTTCATTAGGCGACCGCGCCCTAGCTTTTAGCATTTACAAAGCACTAACAAATAGTGCTGTTACTAATCTAATGCGTTGGTAGAGGACATTTTAAAGGCAAGGGGTCGCACCTTGAAAATGGACAGTGGACGCGAGTGTC']
k=15
t=20
N=2000
def indexToAcid(i):
return "ACGT"[i]
def acidToIndex(a):
return "ACGT".index(a)
def repeatedGibbsSampler( dna, k, N, R ):
bestMotifs = randomMotifs(dna,k)
bestScore = Score(bestMotifs)
for i in range(0,R):
(motifs,score) = gibbsSampler(dna,k,N)
if score < bestScore:
bestMotifs = motifs
bestScore = score
return bestMotifs
def randomMotifs( dna, k ):
return [randomKmer(seq,k) for seq in dna]
def randomKmer( seq, k ):
start = random.randint(0, len(seq)-k)
return seq[start:start+k]
def Score( motifs ):
score = 0
for count in countsFromMotifs(motifs,0):
score += sum(count) - max(count)
return score
def countsFromMotifs( motifs, initCount ):
k = len(motifs[0])
for motif in motifs:
assert k == len(motif)
counts = []
for i in range(k):
currCount = [initCount] * 4
for motif in motifs:
currCount[acidToIndex(motif[i])] += 1
counts.append(currCount)
return counts
def gibbsSampler( dna, k, N ):
t = len(dna)
motifs = randomMotifs(dna,k)
bestMotifs = motifs
bestScore = Score(bestMotifs)
for step in range(N):
i = random.randint(0, t-1)
profile = profileFromMotifs(motifs[:i] + motifs[i+1:], 1)
pr = [probKmerInProfile(dna[i][s:s+k],profile) for s in range(len(dna[i])-k+1)]
s = Random(pr)
motifs[i] = dna[i][s:s+k]
score = Score(motifs)
if score < bestScore:
bestMotifs = motifs[:]
bestScore = score
return (bestMotifs,bestScore)
def Random( pr ):
total = float(sum(pr))
r = random.random()
partialSum = 0.0
for i in range(len(pr)):
partialSum += pr[i]
if partialSum/total >= r:
return i
return -1
def profileFromMotifs( motifs, initCount ):
counts = countsFromMotifs(motifs,initCount)
profile = []
for count in counts:
total = float(sum(count))
probs = [c/total for c in count]
profile.append(probs)
return profile
def probKmerInProfile( kmer, profile ):
prob = 1.0
for i in range(len(kmer)):
prob *= profile[i][acidToIndex(kmer[i])]
return prob
R = 20
bestMotifs = repeatedGibbsSampler(dna,k,N,R)
for motif in bestMotifs:
print(' '.join(motif))
print(Score(motif))