Skip to content

Commit

Permalink
fix: correct pubchem crawling from smiles with multiple entries
Browse files Browse the repository at this point in the history
  • Loading branch information
jannisborn committed Oct 3, 2024
1 parent 78ad7c9 commit 3c42e78
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
4 changes: 1 addition & 3 deletions pytoda/preprocessing/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,7 @@ def get_smiles_from_pubchem(
path = '{}/{}/{}/{}/{}/{}'.format(
PUBCHEM_START, query_type, drug, PUBCHEM_MID, option, PUBCHEM_END
)
- smiles = (
- urllib_request.urlopen(path).read().decode('UTF-8').replace('\n', '')
- )
+ smiles = urllib_request.urlopen(path).read().decode('UTF-8').split()[0]
if not kekulize:
smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize))
return smiles
Expand Down
11 changes: 10 additions & 1 deletion pytoda/preprocessing/tests/test_crawlers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Testing Crawlers."""

import unittest

from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles,
Expand Down Expand Up @@ -31,7 +32,7 @@ def test_get_smiles_from_zinc(self) -> None:
self.assertEqual(smiles, ground_truth)

def test_get_smiles_from_pubchem(self) -> None:
- """Test get_smiles_from_zinc"""
+ """Test get_smiles_from_pubchem"""

for sanitize in [True, False]:

Expand Down Expand Up @@ -83,6 +84,14 @@ def test_get_smiles_from_pubchem(self) -> None:
)
self.assertEqual(smiles, ground_truth)

# Test molecule where landing page has several entries
gt_smiles = (
'CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC'
)
drug = 'Staurosporine'
smiles = get_smiles_from_pubchem(drug, use_isomeric=False, kekulize=True)
self.assertEqual(smiles, gt_smiles)

def test_query_pubchem(self) -> None:
"""Test query_pubchem"""
# pass
Expand Down

0 comments on commit 3c42e78

Please sign in to comment.