import os
import logging
import pathlib
import argparse
import json
import re
from itertools import groupby, count
import ihm
import ihm.location
import ihm.dataset
import ihm.representation
import ihm.restraint
import ihm.protocol
import ihm.model
import ihm.dumper
from Bio.PDB import PDBParser as BioParser, Selection, NeighborSearch
VALID_BASES = ['DA', 'DC', 'DG', 'DT', 'A', 'C', 'G', 'U', 'DJ']
VALID_AMINOACIDS = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE',
'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'ACE', 'ALY', 'ASH', 'CFE', 'CHX', 'CSP', 'CTN', 'CYC',
'CYF', 'CYM', 'DDZ', 'DUM', 'GLH', 'HLY', 'HYP', 'M3L', 'MLY', 'MLZ', 'MSE', 'NEP', 'PNS', 'PTR',
'QSR', 'SEP', 'TOP', 'TYP', 'TYS']
AMINOACID_CODES = [
# 20 canonical amino acids
('CYS', 'C'), ('ASP', 'D'), ('SER', 'S'), ('GLN', 'Q'),
('LYS', 'K'), ('ILE', 'I'), ('PRO', 'P'), ('THR', 'T'),
('PHE', 'F'), ('ASN', 'N'), ('GLY', 'G'), ('HIS', 'H'),
('LEU', 'L'), ('ARG', 'R'), ('TRP', 'W'), ('ALA', 'A'),
('VAL', 'V'), ('GLU', 'E'), ('TYR', 'Y'), ('MET', 'M'),
# Non-canonical amino acids
('MSE', 'M'), ('SOC', 'C')]
BASE_CODES = [
# 5 canonical nucleic acids
('ADE', 'A'), ('GUA', 'G'), ('THY', 'T'), ('CYT', 'C'), ('URI', 'U'),
('DA', 'A'), ('DG', 'G'), ('DT', 'T'), ('DC', 'C'),
('A', 'A'), ('G', 'G'), ('T', 'T'), ('C', 'C'), ('U', 'U')]
NARES3 = ['---', 'CYT', 'THY', 'GUA', 'ADE', 'URI']
NARES2 = ['--', 'DC', 'DT', 'DG', 'DA', 'DU']
THREE_TO_ONE = dict(AMINOACID_CODES + BASE_CODES)
THREE_TO_TWO = dict(zip(NARES3, NARES2))
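# Quick illustration of the lookup tables (values follow directly from the lists above):
#   THREE_TO_ONE['ALA'] -> 'A';  THREE_TO_ONE['DG'] -> 'G'
#   THREE_TO_TWO['CYT'] -> 'DC'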
class Restraint:
"""Represent a CNS restraint between two residues as a tuple"""
def __init__(self, partner1_chainid, partner1_resid, partner2_chainid, partner2_resid, distance, groupid):
#: Chain ID of 1st residue
self.partner1_chainid = partner1_chainid
#: Residue ID of 1st residue
self.partner1_resid = partner1_resid
#: Chain ID of 2nd residue
self.partner2_chainid = partner2_chainid
#: Residue ID of 2nd residue
self.partner2_resid = partner2_resid
        #: Distance restraint as a (d, d_minus, d_plus) tuple taken from the assign statement
self.distance = distance
self.groupid = groupid
def __repr__(self):
return f'Restraint ({self.groupid}): {self.partner1_chainid}.{self.partner1_resid} <-> '\
f'{self.partner2_chainid}.{self.partner2_resid} : {self.distance}'
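# Illustrative construction of a Restraint (all values made up):
#   r = Restraint('A', 12, 'B', 34, (2.0, 2.0, 0.0), 1)
#   repr(r)  # -> 'Restraint (1): A.12 <-> B.34 : (2.0, 2.0, 0.0)'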
def _parse_block(block, group=1):
"""
    Parse an `assign` block containing one or more selections and a distance value

    :param list block: Lines of an assign statement, starting with `assign` and ending with the distance values
:param int group: ID of the first group to be found
:return: List of restraints based on the selections
:rtype: list of :class:`~tools.TBLHandler.Restraint`
"""
block = "".join(block)
distance = re.findall(r'(\d+\.\d+) (\d+\.\d+) (\d+\.\d+)', block)
distance = tuple([float(d) for d in distance[0]])
selections = [s.strip('() ') for s in re.findall(r'\([^(]*\)', block)]
partner1 = selections[0]
partners = selections[1:]
resid = r'resid +(\d+)'
chainid = r'segid +([A-Z])'
partner1_chainid = re.findall(chainid, partner1)[0]
partner1_resid = int(re.findall(resid, partner1)[0])
restraints = []
for p in partners:
partner2_chainid = re.findall(chainid, p)[0]
partner2_resid = int(re.findall(resid, p)[0])
restraints.append(Restraint(partner1_chainid, partner1_resid, partner2_chainid, partner2_resid,
distance, group))
return restraints
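# Shape of the `assign` statements this parser expects (a hedged illustration;
# real CNS/TBL syntax is richer than what these regexes capture):
#   assign (segid A and resid 12) (segid B and resid 34) 2.0 2.0 0.0
# parses into one Restraint with distance (2.0, 2.0, 0.0).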
def extract_restraints(tbl_content):
"""
Extract restraints from a TBL file content
:param list tbl_content: List of TBL file lines to be parsed
:return: List of all restraints present in the file
:rtype: list of :class:`~tools.TBLHandler.Restraint`
"""
restraints = []
current_assign = []
num_open = 0
num_close = 0
assign_block_found = False
    for line in tbl_content.splitlines():
line = line.strip()
if line and line[0] != "!":
if assign_block_found:
if num_open != num_close or len(current_assign) == 1:
current_assign.append(line)
num_open += line.count("(")
num_close += line.count(")")
else:
restraints.extend(_parse_block(current_assign))
current_assign = []
num_open = 0
num_close = 0
assign_block_found = False
if line.startswith("assign"):
assign_block_found = True
current_assign.append(line)
num_open += line.count("(")
num_close += line.count(")")
    # Process the last assign block
restraints.extend(_parse_block(current_assign))
return restraints
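# Minimal usage sketch (the file name is illustrative):
#   with open('ambig.tbl') as fh:
#       restraints = extract_restraints(fh.read())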
def get_interface_ranges(interface):
"""
Convert lists of interface residues to lists of interface residue ranges
    :param interface: Dictionary of interface residues per chain ID (e.g. {'A': [1, 2, 3], 'B': [10, 11, 12], ...})
    :return: interface_ranges: Dictionary of interface residues as ranges (e.g. {'A': [(1, 3), (8, 10)], 'B': [(10, 12)]})
    :rtype: dict
    """
    def as_range(iterable):
        """Collapse an iterable of consecutive residue IDs into a (first, last) tuple"""
        run = list(iterable)
        if len(run) > 1:
            return (run[0], run[-1])
        return (run[0], run[0])
    interface_ranges = {}
    for p in interface:
        # n - next(c) is constant within a run of consecutive residue IDs,
        # so groupby splits the sorted list into consecutive stretches
        interface_ranges[p] = [as_range(g) for _, g in groupby(interface[p], lambda n, c=count(): n - next(c))]
return interface_ranges
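# The groupby/count trick collapses consecutive runs, e.g.:
#   get_interface_ranges({'A': [1, 2, 3, 8, 10]}) -> {'A': [(1, 3), (8, 8), (10, 10)]}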
def get_interface_residues(pdb_file, radius=10.0):
"""Return a list of interacting residues based on accessibility and distance to partner
:param str pdb_file: PDB file path
:param int,float radius: Maximum distance to be considered to define a residue as interacting with another chain
:return: interface: Dictionary of interface residues per chain IDs (e.g. {'A':[1,2,3], 'B': [10,11,12], ...}
:rtype: dict
"""
p = BioParser(QUIET=True)
s = p.get_structure('pdb', pdb_file)
if sum(1 for _ in s.get_chains()) < 2:
logging.error("Less than 2 chains have been detected in your PDB, no interface can be extracted.")
# raise PDBParsingError("Only one chain has been found, no interface can be extracted.")
m = s[0]
all_atoms = Selection.unfold_entities(m, 'A')
# Unfold atoms for NeighborSearch algorithm to work
ns = NeighborSearch(all_atoms)
interface = ns.search_all(radius, "R")
# Filter redundant residues
    buffer = {ch.id: [] for ch in m.get_chains()}
for r in interface:
if r[0].parent.id != r[1].parent.id:
if r[0].id[1] not in buffer[r[0].parent.id]:
buffer[r[0].parent.id].append(r[0].id[1])
if r[1].id[1] not in buffer[r[1].parent.id]:
buffer[r[1].parent.id].append(r[1].id[1])
interface = buffer
for ch_id in m.get_chains():
interface[ch_id.id] = sorted(interface[ch_id.id])
return interface
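# Typical call (file name illustrative; 10 Angstrom is the default contact radius):
#   interface = get_interface_residues('cluster1_1.pdb')  # -> e.g. {'A': [30, 31], 'B': [5, 6]}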
def _update_peptide_alphabet():
"""Update the IHM mmCIF alphabet with modified amino-acids and ligands supported by HADDOCK
:return dict_extension: A dictionary extension containing the information for new amino-acids and ligands
:rtype: dict
"""
dict_extension = {}
dict_extension['ACE'] = ihm.LPeptideChemComp(id='ACE', code='ACE', code_canonical='n', name='Nterm acetyl group',
formula='C2 H3 O2')
dict_extension['ALY'] = ihm.LPeptideChemComp(id='ALY', code='ALY', code_canonical='K', name='Acetylated LYS',
formula='C8 H14 O2 N2')
dict_extension['ASH'] = ihm.LPeptideChemComp(id='ASH', code='ASH', code_canonical='D', name='Protonated ASP',
formula='C4 H5 O3 N1')
    dict_extension['CFE'] = ihm.LPeptideChemComp(id='CFE', code='CFE', code_canonical='C',
                                                 name='CYS with iron sulfur cluster', formula='C3 H4 O1 N1 S3 Fe2')
dict_extension['CSP'] = ihm.LPeptideChemComp(id='CSP', code='CSP', code_canonical='C', name='Phosphorylated CYS',
formula='C3 H4 O4 N1 S1 P1')
dict_extension['CTN'] = ihm.LPeptideChemComp(id='CTN', code='CTN', code_canonical='c', name='Cterm amide group',
formula='H2 N')
dict_extension['CYF'] = ihm.LPeptideChemComp(id='CYF', code='CYF', code_canonical='C',
name='CYS without the sulfur H',
formula='C3 H4 O1 N1 S1')
dict_extension['CYM'] = ihm.LPeptideChemComp(id='CYM', code='CYM', code_canonical='C', name='CYS with MTSL',
formula='C12 H5 O2 N2 S2')
dict_extension['DDZ'] = ihm.LPeptideChemComp(id='DDZ', code='DDZ', code_canonical='A', name='3,3,-dihydroxy ALA',
formula='C2 H5 O2 N1')
dict_extension['GLH'] = ihm.LPeptideChemComp(id='GLH', code='GLH', code_canonical='Q', name='Protonated GLU',
formula='C5 H7 O3 N1')
    dict_extension['HEB'] = ihm.LPeptideChemComp(id='HEB', code='HEB', code_canonical='h', name='Heme B',
                                                 formula='C34 H30 O4 N4 Fe1')
    dict_extension['HEC'] = ihm.LPeptideChemComp(id='HEC', code='HEC', code_canonical='h', name='Heme C',
                                                 formula='C34 H32 O4 N4 Fe1')
dict_extension['HYP'] = ihm.LPeptideChemComp(id='HYP', code='HYP', code_canonical='P', name='4R-hydroxyproline',
formula='C5 H7 O2 N1')
dict_extension['M3L'] = ihm.LPeptideChemComp(id='M3L', code='M3L', code_canonical='K', name='Trimethyl LYS',
formula='C9 H21 O1 N2')
dict_extension['MLY'] = ihm.LPeptideChemComp(id='MLY', code='MLY', code_canonical='K', name='Dimethyl LYS',
formula='C8 H18 O1 N2')
dict_extension['MLZ'] = ihm.LPeptideChemComp(id='MLZ', code='MLZ', code_canonical='K', name='Monomethyl LYS',
formula='C7 H15 O1 N2')
dict_extension['NEP'] = ihm.LPeptideChemComp(id='NEP', code='NEP', code_canonical='H', name='NE phosphorylated HIS',
formula='C6 H7 O4 N3 P1')
dict_extension['PTR'] = ihm.LPeptideChemComp(id='PTR', code='PTR', code_canonical='Y', name='O-Phosphotyrosine',
formula='C9 H10 O6 N1 P3')
dict_extension['SEP'] = ihm.LPeptideChemComp(id='SEP', code='SEP', code_canonical='S', name='Phosphorylated SER',
formula='C3 H4 O5 N1 P1')
    dict_extension['TIP'] = ihm.LPeptideChemComp(id='TIP', code='TIP', code_canonical='w', name='Water model',
                                                 formula='H2 O')
dict_extension['TOP'] = ihm.LPeptideChemComp(id='TOP', code='TOP', code_canonical='T', name='Phosphorylated THR',
formula='C4 H7 O5 N1 P1')
dict_extension['TYP'] = ihm.LPeptideChemComp(id='TYP', code='TYP', code_canonical='Y', name='Phosphorylated TYR',
formula='C9 H8 O5 N1 P1')
dict_extension['TYS'] = ihm.LPeptideChemComp(id='TYS', code='TYS', code_canonical='Y', name='Sulfonated TYR',
formula='C9 H8 O5 N1 S1')
dict_extension['PNS'] = ihm.LPeptideChemComp(id='PNS', code='PNS', code_canonical='S',
name='Phosphopantetheine SER',
formula='C11 H23 O7 N2 S1 P1')
return dict_extension
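# The returned mapping is consulted when a residue name is missing from THREE_TO_ONE, e.g.:
#   _update_peptide_alphabet()['SEP'].code_canonical  # -> 'S'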
def create_ihm_mmcif(pdb_file, params, interface, restraints=None):
"""Create a IHM mmCIF compatible file from a PDB file and a list of HADDOCK parameters and interface residue IDs
:param str pdb_file: PDB file path
:param dict params: HADDOCK parameter file as a dictionary
:param list interface: List of interface residue IDs
:param restraints: List of Restraints used during the docking process
:type: list of :class:`~tools.TBLHandler.Restraint`
"""
# Load run parameters
# with open(param_path) as o:
# params = json.load(o)
# Stores main information - chain IDs, active residues, passives residues
partners = {}
for i, p in params['partners'].items():
partners[p['segid']] = {}
partners[p['segid']]['activeres'] = p['activereslist']
partners[p['segid']]['passiveres'] = p['passivereslist']
partners[p['segid']]['moleculetype'] = p['moleculetype']
# First, we create a system, which contains everything we know about the
# modeling. A single mmCIF file can contain multiple Systems, but in most
# cases we use just one:
system = ihm.System()
p = BioParser(QUIET=True)
try:
s = p.get_structure('HADDOCK model', pdb_file)
except Exception as e:
logging.error("Error while loading the PDB: {}".format(e))
raise
    # Sequences are stored per chain; HOH residues are ignored
seqs_id = {}
seqs = {}
    # Extension of the LPeptideAlphabet for modified aa supported by HADDOCK
dict_extension = _update_peptide_alphabet()
    # Store the sequences and the mapping between author residue IDs and consecutive mmCIF residue IDs (1..n)
for r in s.get_residues():
if r.get_parent().id in partners and r.resname != "HOH":
segid = r.get_parent().id
if segid not in seqs_id:
seqs_id[segid] = 1
else:
seqs_id[segid] += 1
if segid not in seqs:
seqs[segid] = {'seq': [], 'id_from_pdb': {}, 'id_from_seq': {}, 'hetero': False}
seqs[segid]['id_from_pdb'][seqs_id[segid]] = r.id[1]
seqs[segid]['id_from_seq'][r.id[1]] = seqs_id[segid]
            # Store the one-letter code for standard residues; THREE_TO_ONE covers
            # both amino acids and nucleotides
            try:
                seqs[segid]['seq'].append(THREE_TO_ONE[r.resname.strip()])
            except KeyError:
                # Non-standard residue: either a modified residue supported by
                # HADDOCK or a ligand (stored as a single non-polymer component)
                if partners[segid]['moleculetype'] not in ("DNA", "RNA") and r.resname.strip() in dict_extension:
                    seqs[segid]['seq'].append(dict_extension[r.resname.strip()])
                elif partners[segid]['moleculetype'] not in ("DNA", "RNA") and not seqs[segid]['hetero']:
                    seqs[segid]['hetero'] = True
                    seqs[segid]['seq'].append(r.resname.strip())
                else:
                    logging.error(f"Unsupported residue {r.resname.strip()} in chain {segid}")
            except Exception as e:
                logging.error(f"An issue occurred when trying to add residue {r}: {e}")
                raise
    # Create entities with the appropriate alphabet depending on the unit type (protein, RNA, DNA)
entities = []
for i, ch_id in enumerate(partners):
if partners[ch_id]['moleculetype'] == 'DNA':
alphabet = ihm.DNAAlphabet
elif partners[ch_id]['moleculetype'] == 'RNA':
alphabet = ihm.RNAAlphabet
else:
alphabet = ihm.LPeptideAlphabet
if not seqs[ch_id]['hetero']:
entities.append(ihm.Entity(seqs[ch_id]['seq'], alphabet=alphabet, description=f'Partner {i+1}'))
else:
entities.append(ihm.Entity([ihm.NonPolymerChemComp(seqs[ch_id]['seq'][0])], description=f'Partner {i+1}'))
system.entities.extend(tuple(entities))
    # Create the asymmetric units, preserving the author residue numbering
    asym_units = {}
    for i, ch_id in enumerate(partners):
        asym_units[ch_id] = ihm.AsymUnit(entities[i], auth_seq_id_map=seqs[ch_id]['id_from_pdb'], id=ch_id)
    system.asym_units.extend(asym_units.values())
    # Create asymmetric unit ranges from interface residues
asym_unit_ranges = []
for ch_id, unit in asym_units.items():
if not seqs[ch_id]['hetero']:
for r in interface[unit.id]:
asym_unit_ranges.append(ihm.AsymUnitRange(unit, seqs[unit.id]['id_from_seq'][r[0]],
seqs[unit.id]['id_from_seq'][r[1]]))
# Create Model assembly
    modeled_assembly = ihm.Assembly(list(asym_units.values()), name='Modeled assembly')
assemblies = []
for ch_id, unit in asym_units.items():
assemblies.append(ihm.Assembly((unit,), name=f'Subunit {unit.id}'))
    # In HADDOCK the interface is flexible whereas the rest of each molecule is rigid,
    # so the interface ranges are represented as separate flexible segments
segm = [ihm.representation.AtomicSegment(unit, rigid=True) for ch_id, unit in asym_units.items()]
segm += [ihm.representation.AtomicSegment(asym, rigid=False) for asym in asym_unit_ranges]
rep = ihm.representation.Representation(segm)
    # Define dataset locations (in auto mode they are defined as external files)
pdb_datasets = []
em_dataset = []
for i, p in enumerate(partners):
loc = ihm.location.InputFileLocation(pdb_file.parent / f'data/sequence/protein{i+1}.pdb')
dataset = ihm.dataset.PDBDataset(loc)
pdb_datasets.append(dataset)
if params['em_rest']:
loc = ihm.location.InputFileLocation(pdb_file.parent / f'data/sequence/protein{i+1}.pdb')
dataset = ihm.dataset.EMDensityDataset(loc)
em_dataset.append(dataset)
all_datasets = ihm.dataset.DatasetGroup(pdb_datasets + em_dataset)
# Extract restraints
# TODO (un)ambiguous restraints
poly_res_features = []
if restraints:
for r in restraints:
a1 = asym_units[r.partner1_chainid]
f = ihm.restraint.ResidueFeature([a1(seqs[r.partner1_chainid]['id_from_seq'][r.partner1_resid],
seqs[r.partner1_chainid]['id_from_seq'][r.partner1_resid])])
if f.ranges[0] not in poly_res_features:
system.orphan_features.append(f)
poly_res_features.append(f.ranges[0])
a2 = asym_units[r.partner2_chainid]
if seqs[r.partner2_chainid]['hetero']:
f2 = ihm.restraint.ResidueFeature([a2])
else:
f2 = ihm.restraint.ResidueFeature([a2(seqs[r.partner2_chainid]['id_from_seq'][r.partner2_resid],
seqs[r.partner2_chainid]['id_from_seq'][r.partner2_resid])])
if f2.ranges[0] not in poly_res_features:
system.orphan_features.append(f2)
poly_res_features.append(f2.ranges[0])
        # We do not know where the ambiguous restraints come from, so we create a generic dataset
        # that is used as reference for the ihm_derived_distance_restraint list
loc = ihm.location.InputFileLocation(pdb_file.parent / 'data/distances/ambig.tbl')
derived_dist_dataset = ihm.dataset.Dataset(loc)
dist = ihm.restraint.UpperBoundDistanceRestraint(2.0)
probability = 1 if not params["noecv"] else 1.0 / params["ncvpart"]
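        # e.g. with noecv enabled and ncvpart = 2, each restraint is kept with probability 0.5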
        restraint_pairs = []
        for feature in system.orphan_features:
            for feature2 in system.orphan_features:
                # Create restraints only between residues of different subunits (chain IDs),
                # and only once per unordered pair of features
                if feature.ranges[0].asym.id != feature2.ranges[0].asym.id \
                        and (feature, feature2) not in restraint_pairs \
                        and (feature2, feature) not in restraint_pairs:
                    r = ihm.restraint.DerivedDistanceRestraint(dataset=derived_dist_dataset, feature1=feature,
                                                               feature2=feature2, distance=dist,
                                                               probability=probability)
                    system.restraints.append(r)
                    restraint_pairs.append((feature, feature2))
if params['em_rest']:
em_rsr = ihm.restraint.EM3DRestraint(dataset=em_dataset[0], assembly=modeled_assembly)
system.restraints.append(em_rsr)
# Add protocol used
if params['em_rest']:
protocol = ihm.protocol.Protocol(name='HADDOCK-EM')
else:
protocol = ihm.protocol.Protocol(name='HADDOCK')
protocol.steps.append(ihm.protocol.Step(
assembly=modeled_assembly,
dataset_group=all_datasets,
method='Rigid-body minimization in HADDOCK (it0)',
name='Rigid-body minimization',
num_models_begin=0,
num_models_end=params['structures_0'], multi_scale=True))
protocol.steps.append(ihm.protocol.Step(
assembly=modeled_assembly,
dataset_group=all_datasets,
method='Semi-flexible SA in HADDOCK (it1)',
name='Simulated annealing',
num_models_begin=0,
num_models_end=params['structures_1'], multi_scale=True))
protocol.steps.append(ihm.protocol.Step(
assembly=modeled_assembly,
dataset_group=all_datasets,
method='Water refinement in HADDOCK (itw)',
name='Refinement',
num_models_begin=0,
num_models_end=params['waterrefine'], multi_scale=True))
    atoms = {ch.id: [] for ch in s.get_chains()}
for c in s.get_chains():
for r in c:
for a in r:
atoms[c.id].append((c.id, seqs[c.id]['id_from_seq'][r.id[1]], a.name[0], a.name, a.coord[0], a.coord[1],
a.coord[2]))
# Subclassing the IHM Model class and overriding the get_atoms method:
class MyModel(ihm.model.Model):
        # Map chain IDs to IHM asym_unit objects
        asym_unit_map = {unit.id: unit for unit in asym_units.values()}
def get_atoms(self):
for ch_id in self.asym_unit_map:
for asym, seq_id, type_symbol, atom_id, x, y, z in atoms[ch_id]:
yield ihm.model.Atom(asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol, seq_id=seq_id,
atom_id=atom_id, x=x, y=y, z=z)
    # The cluster number and rank are encoded in file names such as cluster12_1.pdb;
    # extract whole numbers rather than single digits
    clus, best = [int(n) for n in re.findall(r'\d+', os.path.basename(pdb_file))][:2]
    if clus == best == 1:
        model = MyModel(assembly=modeled_assembly, protocol=protocol, representation=rep, name='Best scoring model')
    else:
        model = MyModel(assembly=modeled_assembly, protocol=protocol, representation=rep,
                        name=f'Best {best} of cluster {clus}')
    # Add the cross-correlation value if an EM map was used
    if params['em_rest']:
        cc = None
        with open(pdb_file) as o:
            for l in o:
                if 'cross-correlation' in l:
                    cc = float(l.split()[3])
                    break
        em_rsr.fits[model] = ihm.restraint.EM3DRestraintFit(cross_correlation_coefficient=cc)
# Similar models can be grouped together. Here we only have a single model
# in the group
model_group = ihm.model.ModelGroup([model], name='All models')
# Groups are then placed into states, which can in turn be grouped. In this
# case we have only a single state:
state = ihm.model.State([model_group])
system.state_groups.append(ihm.model.StateGroup([state]))
# Dump mmCIF file
cif_file = pdb_file.parent / pdb_file.name.replace('.pdb', '.cif')
try:
logging.info(f"Saving as {pdb_file.name.replace('.pdb', '.cif')}")
with open(cif_file, 'w') as cif_fh:
ihm.dumper.write(cif_fh, [system])
    except Exception as e:
        logging.error(f"Error saving the mmCIF file: {e}")
def load_run_parameters(json_file):
    """Load the HADDOCK job parameters from a JSON file and return them as a dictionary"""
    with open(json_file) as fh:
        return json.load(fh)
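# Fields this script reads from job_params.json (a hedged sketch; values are illustrative):
#   {"partners": {"1": {"segid": "A", "activereslist": [...], "passivereslist": [...],
#                       "moleculetype": "Protein"}},
#    "em_rest": false, "noecv": true, "ncvpart": 2,
#    "structures_0": 1000, "structures_1": 200, "waterrefine": 200, "tblfile": "..."}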
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("run_directory",
help="Location of the uncompressed run, ex: /home/rodrigo/runs/47498-protein-protein")
args = parser.parse_args()
logging.basicConfig(level='DEBUG',
format='[%(asctime)s] %(levelname)s %(message)s',
datefmt='%d/%m/%Y %H:%M:%S')
logging.info('Converting the cluster*.pdb structures to .cif')
run_path = pathlib.Path(args.run_directory)
logging.info(f'Looking for models in {run_path}')
cluster_pdbs = list(run_path.glob('cluster*.pdb'))
cluster_pdbs.sort()
logging.info(f'Found {len(cluster_pdbs)} structures')
    job_params = load_run_parameters(run_path / 'job_params.json')
    # The tblfile field may be empty
    logging.info(f"Looking for the tblfile field in {run_path / 'job_params.json'}")
    tblfile = job_params["tblfile"]
if tblfile:
logging.info('tblfile field found, extracting information')
restraints = extract_restraints(tblfile)
else:
        logging.warning('tblfile field is empty, no restraints will be converted')
        restraints = None
for cluster_strct in cluster_pdbs:
logging.info(f'Converting {cluster_strct}')
interface_res = get_interface_residues(cluster_strct)
interface_ranges = get_interface_ranges(interface_res)
        create_ihm_mmcif(cluster_strct, job_params, interface_ranges, restraints)
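# Example invocation (path illustrative):
#   python create_cif.py /home/rodrigo/runs/47498-protein-protein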