-
Notifications
You must be signed in to change notification settings - Fork 0
/
tadoss.py
340 lines (231 loc) · 11.6 KB
/
tadoss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
'''
==============================================================
PUBLIC DOMAIN NOTICE
National Institutes of Health
This software is a "United States Government Work" under the
terms of the United States Copyright Act. It was written as
part of the authors' official duties as United States
Government employees and thus cannot be copyrighted. This
software is freely available to the public for use. The
National Institutes of Health and the U.S. Government have not
placed any restriction on its use or reproduction.
Although all reasonable efforts have been taken to ensure
the accuracy and reliability of the software and data, the
NIH and the U.S. Government do not and cannot warrant the
performance or results that may be obtained by using this
software or data. The NIH and the U.S. Government disclaim
all warranties, express or implied, including warranties of
performance, merchantability or fitness for any particular
purpose.
Please cite the authors in any work or product based on this
material.
==============================================================
'''
import numpy as np
import argparse as ap
import warnings
from math import acos, sin, sqrt
import Bio.PDB as bp
from Bio import BiopythonWarning
################### Custom functions
def dotproduct(v1, v2):
return sum((a*b) for a, b in zip(v1, v2))
def length(v):
return sqrt(dotproduct(v, v))
def angle(v1, v2):
return acos( dotproduct(v1, v2) / (length(v1) * length(v2)) )
################### Inputs
parser = ap.ArgumentParser()
parser.add_argument("domain", help="code of the domain to analyze", type=str)
parser.add_argument("file", help="path to the structure file", type=str)
parser.add_argument("-m", "--ext", help="residue extension needed between N anc C terminal",
type=int, default=-1)
parser.add_argument("-l", "--linker", help="linker length between the N and C terminal (default: 0)",
type=int, default=0)
parser.add_argument("-p", "--hinge", help="minimum hinge length, the loop between domains (default: 3)",
type=int, default=3)
parser.add_argument("-w", "--warn", help="print warning messages",
action='store_true')
args = parser.parse_args()
# Code of the domain
domain = args.domain
# Path to the structure file in PDB format and
path = args.file
# Note that if N- and C- termini point in opposite directions, the two termini will form the
# turn of the joint loop which does not contribute to the the effective length.
M = args.ext
# Linker length in residues (equivalent to 21A of effective length)
linker = args.linker
# Size in resdiues of the loop extension (even number) - minimum number of residues to peel to form the connecting loop.
hinge = args.hinge
# Offset in residues at the N and C terminal to start trying the circular permutant cuts
offset_cp = 10
# Silent BioPython warnings
if not args.warn:
warnings.simplefilter('ignore', BiopythonWarning)
################### Parameters
# Temperature of reference
Tref = 350.0
# kcal/mol*K scaling factor for the go model
dS = 0.0054
# Bringing the N- and C-terminus closer together will require peeling off a small part of the native structure, starting from the termini.
# break_L is the maximum number of residues to peel from each termini.
break_L = 10
# Cutting a loop will require peeling off a small part of the native structure, starting from the break point.
# break_L is the maximum number of residues to peel from each end of the loop cut.
break_L_cut = 4
# Average length contribution of each residue.
r0 = 3.5
################### Calculation
print "1 Parsing PDB file..."
# Use BioPython to parse the PDB structure
parser = bp.PDBParser()
structure = parser.get_structure(domain, path)
# Extract all the residues of the structure in a list
res_list = bp.Selection.unfold_entities(structure, 'R')
# Total number of residues in the structure
L_p = len(res_list)
print "2 Parsing contact energies between residues..."
# Parse the GO model contact energy file
contacts = np.genfromtxt("go_{}/go_{}_gomodel_golist.dat".format(domain,domain))
# Number of free residues at N- and C-termini, defined as the number of residues which do not have native contacts.
free_N = L_p
free_C = 0
for c in contacts:
if c[0] < free_N:
free_N = c[0]
if c[1] > free_C:
free_C = c[1]
free_N = int(free_N) - 1
free_C = L_p - int(free_C)
print " Free resdiues at the N and C terminal: {}, {}".format(free_N, free_C)
dist_NC = res_list[L_p - free_C - 1]['CA'] - res_list[free_N]['CA']
dist_NC4 = res_list[L_p - free_C - 5]['CA'] - res_list[free_N+4]['CA']
print " Distance N to C teminal: {} A".format(round(dist_NC,1))
print " Distance N+4 to C-5 teminal: {} A".format(round(dist_NC4,1))
# Calculate the angle between the N and the C termini
c_ter_res1 = res_list[L_p - free_C - 1]['CA'].get_vector() # get the coordinates of residue C_i
c_ter_res2 = res_list[L_p - free_C - 5]['CA'].get_vector() # get the coordinates of the residue C_i - 4
n_ter_res1 = res_list[free_N]['CA'].get_vector() # get the coordinates of residue N_j
n_ter_res2 = res_list[free_N + 4]['CA'].get_vector() # get the coordinates of the residue N_j + 4
v1 = np.array(c_ter_res1) - np.array(c_ter_res2) # get the direction of the C-terminus
v2 = np.array(n_ter_res1) - np.array(n_ter_res2) # get the direction of the N-terminus
angle_NC = angle(v1, v2) # calculate the angle between N and C - terminus
print " Angle between N and C temini: {} rad".format(round(angle_NC, 1))
if M < 0:
# Case 1, they are pointing away to each other (loop needed)
if dist_NC4 < dist_NC:
# instead of M = 0 or 6 as the origional paper, use the effective residue distance M' = 6*sin(ang/2)
M = 6 * sin(angle_NC / 2)
# Case 2, they are pointing to each other (no loop needed)
else:
M = 0
print " Parameter M set to {}".format(round(M,1))
print "3 Calculating the dG of the tandem domain swap..."
# Free energy change (dG_J) of joining the N- and C- termini
energy = [0] * break_L * break_L # List of enthalpy contribution from different i,j combination for dG_J
entropy = [0] * break_L * break_L # List of entropy contribution
zero = False
for i in range(break_L):
for j in range(break_L):
# Approximation of the entropy per residue on N-termini (lost in WT, gained in SWAP)
entropy[i * break_L + j] += dS * Tref * (i)
# Approximation of the entropy per residue on C-termini.
entropy[i * break_L + j] += dS * Tref * (j)
# Adjust the residue indices to the first and last residues in the domain
n_i = i + free_N + 1
c_i = L_p - j - free_C
uniq_ij = []
# The topological requirement of joining the two termini by peeling off residues
if (i + j - M + linker) * r0 > (res_list[n_i - 1]['CA'] - res_list[c_i - 1]['CA']):
if i == 0 and j == 0:
zero = True # Indicate that the (0,0) solution is good
for t in range(len(contacts)):
for ii in range(1, n_i) + range(c_i + 1, L_p + 1):
if ii == contacts[t, 0] or ii == contacts[t, 1]:
if (contacts[t, 0], contacts[t, 1]) not in uniq_ij:
# Energy of native contacts (formed in WT, lost in SWAP)
energy[i * break_L + j] -= contacts[t, 3]
uniq_ij.append((contacts[t, 0], contacts[t, 1]))
# This automatically removes the (0,0) soution even if it is valid, which gives a 0 kcal/mol value
non_zero_ind = np.nonzero(np.array(energy))
energy_array = np.array(energy)[non_zero_ind]
entropy_array = np.array(entropy)[non_zero_ind]
# The minimum change of stabilities
dG_J_array = entropy_array + energy_array
dG_J = np.array(entropy) + np.array(energy)
dG_joinNC = max(dG_J_array)
# Work out how many residues have to be unfolded at each termini to join them
dG_joinNC_index = np.where(dG_J == dG_joinNC)[0][0]
unfold_N = dG_joinNC_index / break_L
unfold_C = dG_joinNC_index % break_L
if zero:
dG_joinNC = max(max(dG_J_array),0)
unfold_N = 0
unfold_C = 0
print " The dG contribution of joining termini is {} kcal/mol, unfolding {} terminal residues".format(round(dG_joinNC,1), unfold_N+unfold_C)
# Free energy change (dG_C) of cutting at certain positions of the structure
dG_cuts = {}
hinge_cuts ={}
# Number of residues at each side of the cut point to extend (unfold)
ext_N = min(hinge / 2, break_L_cut)
ext_C = min((hinge + 1) / 2, break_L_cut)
break_N_cut = break_L_cut - ext_N
break_C_cut = break_L_cut - ext_C
for cp in range(free_N + offset_cp,L_p-offset_cp-free_C+1):
energy_tot = [0] * break_N_cut * break_C_cut
entropy_tot = [0] * break_N_cut * break_C_cut
hinge_len = [0] * break_N_cut * break_C_cut
for i in range(break_N_cut):
for j in range(break_C_cut):
uniq_ij = []
entropy_tot[i * (break_C_cut) + j] = dS * Tref * (i + j) # Loss entropy per residue
hinge_len[i * (break_C_cut) + j] = i+ext_N + j+ext_C
n_i = cp - i - ext_N
c_i = cp + j + ext_C
for t in range(len(contacts)):
for ii in range(n_i, c_i):
if ii == contacts[t, 0] or ii == contacts[t, 1]:
if (contacts[t, 0], contacts[t, 1]) not in uniq_ij:
# Energy of native contacts (formed in WT, lost in SWAP)
energy_tot[i * (break_C_cut) + j] -= contacts[t, 3]
uniq_ij.append((contacts[t, 0], contacts[t, 1]))
energy_tot = np.array(energy_tot)
entropy_tot = np.array(entropy_tot)
hinge_len = np.array(hinge_len)
dgtot = energy_tot + entropy_tot
dG_cuts[cp] = max(dgtot)
hinge_cuts[cp] = hinge_len[np.argmax(dgtot)]
max_dGc = max(dG_cuts.values())
print " The max dG cut position in the structure is {} kcal/mol".format(round(max_dGc),1)
print "4 Saving results for {}".format(domain)
print " Saving dGjoin and domain information to '{}-dG_join.tsv'".format(domain)
with open('{}-dG_join.tsv'.format(domain), 'w') as fout:
fout.write('domain\tfree_N\tfree_C\tunfold_N\tunfold_C\tdist_NC\tLlinker\tangle_NC\tM\tdGjoin\n')
fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(domain, free_N, free_C, unfold_N, unfold_C,
dist_NC, linker, angle_NC, M, dG_joinNC))
print " Saving dGcut profile to '{}-dG_cut.tsv'".format(domain)
with open('{}-dG_cut.tsv'.format(domain), 'w') as fout:
fout.write('domain\tposition\tLhinge\tdGcut\n')
for cp in dG_cuts.keys():
fout.write('{}\t{}\t{}\t{}\n'.format(domain, cp+1, hinge_cuts[cp], dG_cuts[cp]))
print " Saving PDB structure with dGcut in B-factors to '{}-dG_cut.pdb'".format(domain)
# Set the B-factor on the CA atoms
for p in range(0,len(res_list)):
if p in dG_cuts.keys():
res_list[p]['CA'].set_bfactor(dG_cuts[p])
else:
res_list[p]['CA'].set_bfactor(min(dG_cuts.values()))
# Remove terminal residues identified as free
for chain in structure[0]:
for id in range(1, free_N+1):
chain.detach_child((' ', id, ' '))
for id in range(L_p-free_C+1, L_p+1):
chain.detach_child((' ', id, ' '))
io = bp.PDBIO()
io.set_structure(structure)
io.save('{}-dG_cut.pdb'.format(domain))
print " Saving total alchemical ddG = dGjoin + max(dGcut) to '{}-ddG_tot.tsv'".format(domain)
with open('{}-ddG_tot.tsv'.format(domain), 'w') as fout:
fout.write('domain\tlength\tdGjoin\tmax_dGc\tddGtot\n')
fout.write('{}\t{}\t{}\t{}\t{}\n'.format(domain, L_p, dG_joinNC, max_dGc, dG_joinNC+max_dGc))