forked from haddocking/haddock-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdb_strict_format.py
executable file
·237 lines (198 loc) · 10.2 KB
/
pdb_strict_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python
# coding=utf-8
# To support python 2.5+
from __future__ import with_statement
"""
PDBParser.py
Module to validate PDB files.
"""
import os
import re
import argparse
# Regular expression to parse PDB ATOM/HETATM lines
# Spec: http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
re_ATOM = re.compile("""
(ATOM[\s]{2}|HETATM)
(?P<serial>[\d\s]{5})
[\s]{1}
(?P<atname>[\w\s+\']{4})
(?P<altloc>[\w\s]{1})
(?P<resn>[\s\w]{3})
[\s]{1}
(?P<chain>[\w\s]{1})
(?P<resi>[\s\-?\d+$]{4})
(?P<icode>[\w\s]{1})
[\s]{3}
(?P<x>[\s\d.\-]{8})
(?P<y>[\s\d.\-]{8})
(?P<z>[\s\d.\-]{8})
(?P<o>[\s\d.\-]{6})
(?P<b>[\s\d.\-]{6})
""", re.VERBOSE)
# Taken from
# http://www.wwpdb.org/documentation/format33/v3.3.html
valid_records = {'ANISOU', 'ATOM', 'AUTHOR', 'CAVEAT', 'CISPEP', 'COMPND', 'CONECT', 'CRYST1', 'DBREF', 'DBREF1',
'DBREF2', 'ENDMDL', 'EXPDTA', 'FORMUL', 'HEADER', 'HELIX', 'HET', 'HETATM', 'HETNAM', 'HETSYN', 'JRNL',
'KEYWDS', 'LINK', 'MDLTYP', 'MODEL', 'MODRES', 'MTRIX1', 'MTRIX2', 'MTRIX3', 'NUMMDL', 'OBSLTE',
'ORIGX1', 'ORIGX2', 'ORIGX3', 'REVDAT', 'SCALE1', 'SCALE2', 'SCALE3', 'SEQADV', 'SEQRES', 'SHEET',
'SOURCE', 'SPLIT', 'SPRSDE', 'SSBOND', 'TER', 'TITLE', 'MASTER', 'END', 'REMARK', 'SEQALI', 'SPDBVT',
'SPDBVV', 'SPDBVf', 'SPDBVR', 'SPDBVb'}
aa = ["ACE", "CTN", "ALA", "CFE", "CYS", "CYM", "CYF", "CSP", "ASP", "GLU", "PHE", "GLY", "HIS", "NEP", "ILE", "LYS",
"ALY", "MLY", "MLZ", "M3L", "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "SEP", "THR", "THP", "TOP", "VAL",
"TRP", "TYR", "PTR", "TYP", "TYS", "TIP", "HYP", "HEB", "HEC", "WAT", "PNS"]
bases = ["ADE", "CYT", "DOC", "GUA", "DGP", "URI", "THY", "THJ", " DG", " DC", " DT", " DA", "DG ", "DC ", "DT ", "DA ",
" A", " G", " C", " T", " U"]
aa += bases
weirdbases = [" A ", " G ", " C ", " T ", " U "] # unusual one-letter bases
aa += weirdbases
knownligands = ["DFO", "ADY", "ACD", "ACT", "ACN", "BDY", "BEN", "CHE", "DME", "ETA", "EOL", "BUT", "THS", "AMN",
"PHN", "URE", "IMI", "MER"]
aa += knownligands
class PDBParsingError(Exception):
"""
Custom exception class to provide better error messages.
Besides the standard error message, the class appends
the full offending line together with an example valid line.
"""
def __init__(self, user_message, line, final=False):
# Add information to our message
# Valid example ATOM line
valid_atom = "ATOM 32 N AARG A -3 11.281 86.699 94.383 0.50 35.88 N "
full_message = '{0}\n'.format(user_message)
# To avoid repetitive messages, only last PDBParsing exception triggers the wollowing line
if final:
full_message += '{0} <-- (Offending Line)\n{1}(Example Valid Line)'.format(line.strip(), valid_atom)
full_message += '\n\nSee http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM ' \
'for more details'
# Call base class constructor
Exception.__init__(self, full_message)
class PDBParser(object):
"""
Class to parse a PDB file coordinates
and ATOM/HETATM identity.
"""
def __init__(self, fpath=None, chain_id_check=False):
fpath = os.path.abspath(os.path.expanduser(fpath))
self.fpath = self._check_path(fpath)
# Holds tuple for each atom.
self.atoms = []
# Holds key (model_id, chain, resi, icode) for each residue
# and value (aname, position in self.atoms array)
self.residues = {}
self._parse(chain_id_check)
def _check_path(self, path):
""" Verifies the existence and permissions of a file
Args:
path (str): path to a file
Returns:
if existing and accessible, absolute path for file
Raises:
IOError: file cannot be found or read.
"""
full_path = os.path.abspath(path)
if not os.path.exists(full_path):
raise IOError('File does not exist: {0}'.format(full_path))
elif not os.access(full_path, os.R_OK):
raise IOError('File cannot be read (do you have permission?): {0}', format(full_path))
return full_path
def _parse_atom_line(self, line, chain_id_check):
"""Extracts information from ATOM/HETATM line"""
atom_data = re_ATOM.match(line)
if not atom_data:
raise PDBParsingError('ATOM/HETATM line does not meet the expected format', line)
(record, serial, atname,
altloc, resn, chain, resi,
icode, x, y, z, o, b, segid) = atom_data.groups()
# Check if it is a known residue
if record.strip() == "ATOM" and resn not in aa:
raise PDBParsingError('Residue name ("{}") is unknown, please check the syntax or replace ATOM by HETATM'.
format(resn), line)
# This is probably redundant because of the validation done in the regex matching..
try:
resi = int(resi)
except ValueError:
raise PDBParsingError('Residue number must be an integer (is {0!r})'.format(resi), line)
try:
x, y, z, o, b = map(float, [x, y, z, o, b])
except ValueError:
raise PDBParsingError('X,Y,Z coordinates, occupancy, and temperature (b) factors must be decimal numbers.',
line)
if chain_id_check and chain == " ":
if not segid:
raise PDBParsingError('No chain_id found', line)
else:
raise PDBParsingError('No chain_id found but segid detected,\n'
'use http://github.com/haddocking/pdb-tools/pdb_segxchain.py to exchange '
'chain_id for segid', line)
if chain_id_check and segid != " " and (chain != segid):
raise PDBParsingError('The seg_id is different from chain_id', line)
# Could use a named tuple here?
atom = (record, atname, altloc, resn, chain, resi, icode, x, y, z, o, b)
return atom
def _parse_model_line(self, line):
"""Extracts the model number from the line"""
try:
model_number = int(line[11:14])
except ValueError:
raise PDBParsingError('Model number must be an integer (is {0!r})'.format(line[11:14]), line)
else:
return model_number
def _parse(self, chain_id_check):
"""Actual parsing function"""
pdbf = self.fpath
# Counter / other variables
at_counter = 0
model_open = False
model_id = None
# Warning triggered if an ATOM/HETATM line has less than 80 columns
len_warning = False
# Parser loop
with open(pdbf, 'r') as pdb_handle:
for iline, line in enumerate(pdb_handle):
record = line[0:6].strip()
if record in valid_records:
if line.startswith('MODEL'):
if model_open:
raise PDBParsingError('It seems an ENDMDL statement is missing at line {0}'.format(iline),
line, True)
model_open = True
model_id = self._parse_model_line(line)
elif line.startswith('ENDMDL'):
model_open = False
elif line.startswith(('ATOM', 'HETATM')):
if not len_warning and len(line) < 81:
len_warning = True
try:
atom = self._parse_atom_line(line, chain_id_check)
except PDBParsingError as error:
raise PDBParsingError('Could not parse the PDB file at line {0}\n{1}'.
format(iline + 1, error), line, True)
else:
# Register atoms
atom_uid = (model_id, ) + atom
self.atoms.append(atom_uid)
# Register residue
_, atname, _, _, chain, resi, icode, _, _, _, _, _ = atom
res_id = (model_id, chain, resi, icode)
if res_id in self.residues:
self.residues[res_id].append((atname, at_counter))
else:
self.residues[res_id] = [(atname, at_counter)]
at_counter += 1
else:
raise PDBParsingError('Could not parse the PDB file at line {0}: record unknown ({1})'.
format(iline, record), line, True)
if len_warning:
print("WARNING: At least one ATOM/HETATM line consists of less than 80 columns. \nTo follow the official "
"wwPDB guidelines, please use http://github.com/haddocking/pdb-tools/pdb_linewidth to format your "
"PDB file.\nOfficial PDB format guidelines can be found here: "
"http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM.\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="This script validates a PDB file (*.pdb).\n")
parser.add_argument("pdb", help="PDB file")
parser.add_argument("-nc", "--no_chainid", action="store_false", help="Ignore empty chain ids")
args = parser.parse_args()
try:
f_pdb = PDBParser(args.pdb, args.no_chainid)
except PDBParsingError as e:
print(e)