-
Notifications
You must be signed in to change notification settings - Fork 55
/
rflp.py
208 lines (186 loc) · 8.22 KB
/
rflp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Simulate and analyze genetic distances from RFLP results
Copyright 2009 Maurice Ling <mauriceling@acm.org> on behalf of
Maurice Ling, Chin-How Lee, Jack Si-Hao Oon, Kun-Cheng Lee.
@see: Lee, CH, Lee, KC, Oon, JSH, Ling, MHT. 2010. Bactome, I:
Python in DNA Fingerprinting. In: Peer-Reviewed Articles from
PyCon Asia-Pacific 2010. The Python Papers 5(3): 6.
Licence: GNU General Public License version 3
Bactome package is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import math
def restriction_digest(seq, enzyme, max_band=23130, min_band=2000,
linear=False, p='yes'):
"""
Performs restriction endonuclease digestion on a sequence and group the
resulting fragments into 3 groups so simulate different agarose gel
electrophoresis:
1. fragment length more than the maximum size
2. fragment length between the maximum and minimum size
3. fragment length less than the minimum size
Parameters:
seq = DNA sequence for restriction endonuclease digestion
enzyme = Restriction endonuclease object from Bio.Restriction
package
max_band = size of maximum band in basepairs. Default = 23130
min_band = size of minimum band in basepairs. Default = 2000
linear = flag to define if DNA sequence is linear.
Default = False (DNA is circular)
p = flag to determine if the data is to be printed. Default = yes
Result:
(Number of fragments after digestion,
List of fragments with molecular size above max_band,
List of fragments with molecular size between max_band and min_band,
List of fragments with molecular size below min_band)
"""
digest = enzyme.search(seq, linear=linear)
digest.sort()
fragment = [digest[x+1] - digest[x]
for x in range(len(digest) - 1)]
fragment.sort()
ogel = [x for x in fragment if x > max_band]
gel = [x for x in fragment if x <= max_band and x >= min_band]
ugel = [x for x in fragment if x < min_band]
ogel.sort()
gel.sort()
ugel.sort()
if p == 'yes':
print('Enzyme: ' + str(enzyme))
print('Restriction site: ' + enzyme.site)
print('Number of fragments: ' + str(len(fragment)))
print('Number of fragments (x > ' + str(max_band) + '): ' + \
str(len(ogel)))
print('Number of fragments (' + str(max_band) + ' < x < ' + \
str(min_band) + '): ' + str(len(gel)))
print('Number of fragments (x < ' + str(min_band) + '): ' + \
str(len(ugel)))
print
return (len(fragment), ogel, gel, ugel)
def restriction_supplier(seq, max_band=23130, min_band=2000,
suppliers='ACEGFIHKJMONQPSRUVX',
linear=False, p='yes'):
"""
Performs restriction endonuclease analysis by batch, based on supplier.
Parameters:
seq = DNA sequence for restriction endonuclease digestion
max_band = size of maximum band in basepairs. Default = 23130
mmin_band = size of minimum band in basepairs. Default = 2000
suppliers = restriction enzyme supplier. Default = ACEGFIHKJMONQPSRUVX
where
A = Amersham Pharmacia Biotech
C = Minotech Biotechnology
E = Stratagene
G = Qbiogene
F = Fermentas AB
I = SibEnzyme Ltd.
H = American Allied Biochemical, Inc.
K = Takara Shuzo Co. Ltd.
J = Nippon Gene Co., Ltd.
M = Roche Applied Science
O = Toyobo Biochemicals
N = New England Biolabs
Q = CHIMERx
P = Megabase Research Products
S = Sigma Chemical Corporation
R = Promega Corporation
U = Bangalore Genei
V = MRC-Holland
X = EURx Ltd.
linear = flag to define if DNA sequence is linear.
Default = False (DNA is circular)
p = flag to determine if the data is to be printed. Default = yes
Returns:
{Restriction endonuclease :
(Total number of fragments after digestion,
Number of fragments with molecular size above max_band,
Number of fragments with molecular size between max_band and min_band,
Number of fragments with molecular size below min_band)}
"""
from Bio.Restriction import RestrictionBatch
count = 0
result = {}
for enzyme in RestrictionBatch(first=[],
suppliers=[x.upper() for x in suppliers]):
try:
digest = restriction_digest(seq, enzyme, max_band, min_band,
linear, p)
except MemoryError:
print('Memory Error during ' + str(enzyme) + ' digestion')
result[str(enzyme)] = (digest[0], len(digest[1]),
len(digest[2]), len(digest[3]))
count = count + 1
if p != 'yes':
if count % 10 == 0:
print(str(count) + ' restriction endonuclease processed')
return result
def setCompare(original, test, absent):
"""
Used for processing set-based (unordered or nominal) distance of
categorical data.
Parameters:
original: list of original data
test: list of data to test against original
absent: indicator to define absent data
"""
original_only = float(len([x for x in original
if x not in test]))
test_only = float(len([x for x in test if x not in original]))
both = float(len([x for x in original if x in test]))
return (original_only, test_only, both)
def listCompare(original, test, absent):
"""
Used for processing list-based (ordered or ordinal) distance of
categorical data.
Parameters:
original: list of original data
test: list of data to test against original
absent: indicator to define absent data
"""
original = list(original)
test = list(test)
original_only = 0.0
test_only = 0.0
both = 0.0
for i in range(len(original)):
if original[i] == absent and test[i] == absent: pass
elif original[i] == test[i]: both = both + 1
elif original[i] != absent and test[i] == absent:
original_only = original_only + 1
elif original[i] == absent and test[i] != absent:
test_only = test_only + 1
else: pass
return (original_only, test_only, both)
def Nei_Li(original, test, absent=0, type='Set'):
"""
Nei and Li Distance is distance measure for nominal or ordinal data.
Given 2 lists (original and test), calculates the Nei and Li Distance
based on the formula,
1 - [2 x (number of regions where both species are present)/
[(2 x (number of regions where both species are present)) +
(number of regions where only one species is present)]]
@see: Nei M, Li WH (1979) Mathematical models for studying genetic
variation in terms of restriction endonucleases.
Proc Natl Acad Sci USA 76:5269-5273
@param original: list of original data
@param test: list of data to test against original
@param absent: user-defined identifier for absent of region, default = 0
@param type: {Set | List}, define whether use Set comparison (unordered) or
list comparison (ordered), default = Set
"""
if type == 'Set':
(original_only, test_only, both) = setCompare(original, test, absent)
else:
(original_only, test_only, both) = listCompare(original, test, absent)
return 1-((2*both)/((2*both)+original_only+test_only))