Skip to content

Commit

Permalink
Feature het code in pdb from smiles (#150)
Browse files Browse the repository at this point in the history
* getting official HET code using pubchem. If successful, write the pdb file with correct code instead of UNL

* add test to get Hetcode
  • Loading branch information
Jgmedina95 authored Jul 9, 2024
1 parent 6333298 commit 135926e
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 0 deletions.
76 changes: 76 additions & 0 deletions mdagent/tools/base_tools/preprocess_tools/pdb_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,68 @@ def smiles2name(self, smi: str) -> str:
return "Unknown Molecule"
return name

def get_pubchem_cid(self, smiles):
_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
url = _url + f"{smiles}/cids/JSON"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
return data["IdentifierList"]["CID"][0]
return None

def get_hetcode_from_cid(self, cid):
print(cid)
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"
response = requests.get(url)
header_1 = "Interactions and Pathways"
header_2 = "Protein Bound 3D Structures"
header_3 = "Ligands from Protein Bound 3D Structures"
header_4 = "PDBe Ligand Code"
if response.status_code == 200:
data = response.json()
for section in data["Record"]["Section"]:
if section["TOCHeading"] == header_1:
for subsection in section["Section"]:
if subsection["TOCHeading"] == header_2:
for subsubsection in subsection["Section"]:
if subsubsection["TOCHeading"] == header_3:
for s in subsubsection["Section"]:
if s["TOCHeading"] == header_4:
return s["Information"][0]["Value"][
"StringWithMarkup"
][0]["String"]
return None

def get_hetcode(self, smiles):
cid = self.get_pubchem_cid(smiles)
if cid is not None:
return self.get_hetcode_from_cid(cid)
return None

def _add_res_info(self, m, code, name=None):
"""This code is unnecesarilly big considering i only want to add
the HET code to the molecule, but there is a bug in RDKIT that screws the
PDB file if all the other attributes are not added.
Updating residue Name from UNL to the HET code present in pubchem
Everythin else (number, is heteroatom, etc) is kept the same but needs to be
added to avoid the bug.
See: https://github.com/rdkit/rdkit/pull/7286#issue-2200600916"""
block = AllChem.MolToPDBBlock(m)
for line in block.split("\n"):
if line.startswith("HETATM"): # avoiding CONECT, COMPND, etc.
atom_number = line[6:11]
atom_name = line[12:16]
res_number = line[22:26]
atom = m.GetAtomWithIdx(int(atom_number) - 1)
res_inf = Chem.AtomPDBResidueInfo()
res_inf.SetName(f"{atom_name}")
res_inf.SetResidueName(f"{code}")
res_inf.SetResidueNumber(int(res_number))
res_inf.SetIsHeteroAtom(True)
atom.SetPDBResidueInfo(res_inf)
if name:
m.SetProp("_Name", name)

def small_molecule_pdb(self, mol_str: str) -> str:
# takes in molecule name or smiles (converts to smiles if name)
# writes pdb file name.pdb (gets name from smiles if possible)
Expand All @@ -175,16 +237,30 @@ def small_molecule_pdb(self, mol_str: str) -> str:
if self.is_smiles(mol_str):
m = Chem.MolFromSmiles(mol_str)
mol_name = self.smiles2name(mol_str)
HET_code = self.get_hetcode(mol_str)
if not HET_code:
HET_code = "UNK"
else: # if input is not smiles, try getting smiles
smi = self.molname2smiles(mol_str)
m = Chem.MolFromSmiles(smi)
mol_name = mol_str
HET_code = self.get_hetcode(smi)
if not HET_code:
HET_code = "UNL"

try: # only if needed
m = Chem.AddHs(m)

except Exception:
pass

AllChem.EmbedMolecule(m)
# add HET code to molecule

file_name = f"{self.path_registry.ckpt_pdb}/{mol_name}.pdb"
if HET_code != "UNL": # if HET code is UNL no need for this...
self._add_res_info(m, HET_code, mol_name)

Chem.MolToPDBFile(m, file_name)
print("finished writing pdb file")
self.path_registry.map_path(
Expand Down
35 changes: 35 additions & 0 deletions tests/test_preprocess/test_pdb_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,38 @@ def test_packmol_download_only(packmol, small_molecule):
)
assert time_before == time_after
os.remove(f"{packmol.path_registry.ckpt_pdb}/{small_molecule[0]}.pdb")


cids = {
"CO": 887,
"CCO": 702,
"O": 962,
"CC(=O)C": 180,
"C(=O)(N)N": 1176,
"CS(=O)C": 679,
"CN(C)C=O": 6228,
"C(C(CO)O)O": 753,
}


def get_cid(smiles):
return cids[smiles]


pairs = [
("CO", "MOH"),
("CCO", "EOH"),
("O", "HOH"),
("CC(=O)C", "ACN"),
("C(=O)(N)N", "URE"),
("CS(=O)C", "DMS"),
("CN(C)C=O", "DMF"),
("CCO", "EOH"),
("C(C(CO)O)O", "GOL"),
]


@pytest.mark.parametrize("smiles, codes", pairs)
def test_get_het_codes(molpdb, smiles, codes):
cid = get_cid(smiles) # to not test the get_cid function
assert molpdb.get_hetcode_from_cid(cid) == codes

0 comments on commit 135926e

Please sign in to comment.