Feature het code in pdb from smiles (#150)

* getting official HET code using pubchem. If successful, write the pdb file with correct code instead of UNL * add test to get Hetcode
ur-whitelab · Jul 9, 2024 · 135926e · 135926e
1 parent 6333298
commit 135926e
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 0 deletions.
diff --git a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py
@@ -165,6 +165,68 @@ def smiles2name(self, smi: str) -> str:
             return "Unknown Molecule"
         return name
 
+    def get_pubchem_cid(self, smiles):
+        _url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
+        url = _url + f"{smiles}/cids/JSON"
+        response = requests.get(url)
+        if response.status_code == 200:
+            data = response.json()
+            return data["IdentifierList"]["CID"][0]
+        return None
+
+    def get_hetcode_from_cid(self, cid):
+        print(cid)
+        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"
+        response = requests.get(url)
+        header_1 = "Interactions and Pathways"
+        header_2 = "Protein Bound 3D Structures"
+        header_3 = "Ligands from Protein Bound 3D Structures"
+        header_4 = "PDBe Ligand Code"
+        if response.status_code == 200:
+            data = response.json()
+            for section in data["Record"]["Section"]:
+                if section["TOCHeading"] == header_1:
+                    for subsection in section["Section"]:
+                        if subsection["TOCHeading"] == header_2:
+                            for subsubsection in subsection["Section"]:
+                                if subsubsection["TOCHeading"] == header_3:
+                                    for s in subsubsection["Section"]:
+                                        if s["TOCHeading"] == header_4:
+                                            return s["Information"][0]["Value"][
+                                                "StringWithMarkup"
+                                            ][0]["String"]
+        return None
+
+    def get_hetcode(self, smiles):
+        cid = self.get_pubchem_cid(smiles)
+        if cid is not None:
+            return self.get_hetcode_from_cid(cid)
+        return None
+
+    def _add_res_info(self, m, code, name=None):
+        """This code is unnecesarilly big considering i only want to add
+        the HET code to the molecule, but there is a bug in RDKIT that screws the
+        PDB file if all the other attributes are not added.
+        Updating residue Name from UNL to the HET code present in pubchem
+        Everythin else (number, is heteroatom, etc) is kept the same but needs to be
+        added to avoid the bug.
+        See: https://github.com/rdkit/rdkit/pull/7286#issue-2200600916"""
+        block = AllChem.MolToPDBBlock(m)
+        for line in block.split("\n"):
+            if line.startswith("HETATM"):  # avoiding CONECT, COMPND, etc.
+                atom_number = line[6:11]
+                atom_name = line[12:16]
+                res_number = line[22:26]
+                atom = m.GetAtomWithIdx(int(atom_number) - 1)
+                res_inf = Chem.AtomPDBResidueInfo()
+                res_inf.SetName(f"{atom_name}")
+                res_inf.SetResidueName(f"{code}")
+                res_inf.SetResidueNumber(int(res_number))
+                res_inf.SetIsHeteroAtom(True)
+                atom.SetPDBResidueInfo(res_inf)
+        if name:
+            m.SetProp("_Name", name)
+
     def small_molecule_pdb(self, mol_str: str) -> str:
         # takes in molecule name or smiles (converts to smiles if name)
         # writes pdb file name.pdb (gets name from smiles if possible)
@@ -175,16 +237,30 @@ def small_molecule_pdb(self, mol_str: str) -> str:
             if self.is_smiles(mol_str):
                 m = Chem.MolFromSmiles(mol_str)
                 mol_name = self.smiles2name(mol_str)
+                HET_code = self.get_hetcode(mol_str)
+                if not HET_code:
+                    HET_code = "UNK"
             else:  # if input is not smiles, try getting smiles
                 smi = self.molname2smiles(mol_str)
                 m = Chem.MolFromSmiles(smi)
                 mol_name = mol_str
+                HET_code = self.get_hetcode(smi)
+                if not HET_code:
+                    HET_code = "UNL"
+
             try:  # only if needed
                 m = Chem.AddHs(m)
+
             except Exception:
                 pass
+
             AllChem.EmbedMolecule(m)
+            # add HET code to molecule
+
             file_name = f"{self.path_registry.ckpt_pdb}/{mol_name}.pdb"
+            if HET_code != "UNL":  # if HET code is UNL no need for this...
+                self._add_res_info(m, HET_code, mol_name)
+
             Chem.MolToPDBFile(m, file_name)
             print("finished writing pdb file")
             self.path_registry.map_path(

diff --git a/tests/test_preprocess/test_pdb_tools.py b/tests/test_preprocess/test_pdb_tools.py
@@ -131,3 +131,38 @@ def test_packmol_download_only(packmol, small_molecule):
     )
     assert time_before == time_after
     os.remove(f"{packmol.path_registry.ckpt_pdb}/{small_molecule[0]}.pdb")
+
+
+cids = {
+    "CO": 887,
+    "CCO": 702,
+    "O": 962,
+    "CC(=O)C": 180,
+    "C(=O)(N)N": 1176,
+    "CS(=O)C": 679,
+    "CN(C)C=O": 6228,
+    "C(C(CO)O)O": 753,
+}
+
+
+def get_cid(smiles):
+    return cids[smiles]
+
+
+pairs = [
+    ("CO", "MOH"),
+    ("CCO", "EOH"),
+    ("O", "HOH"),
+    ("CC(=O)C", "ACN"),
+    ("C(=O)(N)N", "URE"),
+    ("CS(=O)C", "DMS"),
+    ("CN(C)C=O", "DMF"),
+    ("CCO", "EOH"),
+    ("C(C(CO)O)O", "GOL"),
+]
+
+
+@pytest.mark.parametrize("smiles, codes", pairs)
+def test_get_het_codes(molpdb, smiles, codes):
+    cid = get_cid(smiles)  # to not test the get_cid function
+    assert molpdb.get_hetcode_from_cid(cid) == codes