diff --git a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py index eac925ec..80d37f5c 100644 --- a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py +++ b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py @@ -165,6 +165,68 @@ def smiles2name(self, smi: str) -> str: return "Unknown Molecule" return name + def get_pubchem_cid(self, smiles): + _url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/" + url = _url + f"{smiles}/cids/JSON" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + return data["IdentifierList"]["CID"][0] + return None + + def get_hetcode_from_cid(self, cid): + print(cid) + url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON" + response = requests.get(url) + header_1 = "Interactions and Pathways" + header_2 = "Protein Bound 3D Structures" + header_3 = "Ligands from Protein Bound 3D Structures" + header_4 = "PDBe Ligand Code" + if response.status_code == 200: + data = response.json() + for section in data["Record"]["Section"]: + if section["TOCHeading"] == header_1: + for subsection in section["Section"]: + if subsection["TOCHeading"] == header_2: + for subsubsection in subsection["Section"]: + if subsubsection["TOCHeading"] == header_3: + for s in subsubsection["Section"]: + if s["TOCHeading"] == header_4: + return s["Information"][0]["Value"][ + "StringWithMarkup" + ][0]["String"] + return None + + def get_hetcode(self, smiles): + cid = self.get_pubchem_cid(smiles) + if cid is not None: + return self.get_hetcode_from_cid(cid) + return None + + def _add_res_info(self, m, code, name=None): + """This code is unnecesarilly big considering i only want to add + the HET code to the molecule, but there is a bug in RDKIT that screws the + PDB file if all the other attributes are not added. + Updating residue Name from UNL to the HET code present in pubchem + Everythin else (number, is heteroatom, etc) is kept the same but needs to be + added to avoid the bug. + See: https://github.com/rdkit/rdkit/pull/7286#issue-2200600916""" + block = AllChem.MolToPDBBlock(m) + for line in block.split("\n"): + if line.startswith("HETATM"): # avoiding CONECT, COMPND, etc. + atom_number = line[6:11] + atom_name = line[12:16] + res_number = line[22:26] + atom = m.GetAtomWithIdx(int(atom_number) - 1) + res_inf = Chem.AtomPDBResidueInfo() + res_inf.SetName(f"{atom_name}") + res_inf.SetResidueName(f"{code}") + res_inf.SetResidueNumber(int(res_number)) + res_inf.SetIsHeteroAtom(True) + atom.SetPDBResidueInfo(res_inf) + if name: + m.SetProp("_Name", name) + def small_molecule_pdb(self, mol_str: str) -> str: # takes in molecule name or smiles (converts to smiles if name) # writes pdb file name.pdb (gets name from smiles if possible) @@ -175,16 +237,30 @@ def small_molecule_pdb(self, mol_str: str) -> str: if self.is_smiles(mol_str): m = Chem.MolFromSmiles(mol_str) mol_name = self.smiles2name(mol_str) + HET_code = self.get_hetcode(mol_str) + if not HET_code: + HET_code = "UNK" else: # if input is not smiles, try getting smiles smi = self.molname2smiles(mol_str) m = Chem.MolFromSmiles(smi) mol_name = mol_str + HET_code = self.get_hetcode(smi) + if not HET_code: + HET_code = "UNL" + try: # only if needed m = Chem.AddHs(m) + except Exception: pass + AllChem.EmbedMolecule(m) + # add HET code to molecule + file_name = f"{self.path_registry.ckpt_pdb}/{mol_name}.pdb" + if HET_code != "UNL": # if HET code is UNL no need for this... + self._add_res_info(m, HET_code, mol_name) + Chem.MolToPDBFile(m, file_name) print("finished writing pdb file") self.path_registry.map_path( diff --git a/tests/test_preprocess/test_pdb_tools.py b/tests/test_preprocess/test_pdb_tools.py index 6459bd48..7652197d 100644 --- a/tests/test_preprocess/test_pdb_tools.py +++ b/tests/test_preprocess/test_pdb_tools.py @@ -131,3 +131,38 @@ def test_packmol_download_only(packmol, small_molecule): ) assert time_before == time_after os.remove(f"{packmol.path_registry.ckpt_pdb}/{small_molecule[0]}.pdb") + + +cids = { + "CO": 887, + "CCO": 702, + "O": 962, + "CC(=O)C": 180, + "C(=O)(N)N": 1176, + "CS(=O)C": 679, + "CN(C)C=O": 6228, + "C(C(CO)O)O": 753, +} + + +def get_cid(smiles): + return cids[smiles] + + +pairs = [ + ("CO", "MOH"), + ("CCO", "EOH"), + ("O", "HOH"), + ("CC(=O)C", "ACN"), + ("C(=O)(N)N", "URE"), + ("CS(=O)C", "DMS"), + ("CN(C)C=O", "DMF"), + ("CCO", "EOH"), + ("C(C(CO)O)O", "GOL"), +] + + +@pytest.mark.parametrize("smiles, codes", pairs) +def test_get_het_codes(molpdb, smiles, codes): + cid = get_cid(smiles) # to not test the get_cid function + assert molpdb.get_hetcode_from_cid(cid) == codes