Merge pull request #10 from SeonghwanSeo/dev-2.1.0

Dev 2.1.0
SeonghwanSeo · Aug 28, 2024 · 1729de1 · 1729de1
2 parents 2ff2d49 + b4ad69f
commit 1729de1
Show file tree

Hide file tree

Showing 24 changed files with 1,061 additions and 220 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,9 @@ run.sh
 result/
 examples/library/
 nogit/
+maintain_test/
+tacogfn_reward
+largfn_reward
 
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ Official Github for **_PharmacoNet: Accelerating Large-Scale Virtual Screening b
 1. Fully automated protein-based pharmacophore modeling based on image instance segmentation modeling
 2. Coarse-grained graph matching at the pharmacophore level for high throughput
 3. Pharmacophore-aware scoring function with parameterized analytical function for robust generalization ability
+4. Better pocket representation for deep learning developers. ([Section](#pharmacophore-feature-extraction))
 
 PharmacoNet is an extremely rapid yet reasonably accurate ligand evaluation tool with high generation ability.
 
@@ -164,27 +165,34 @@ score = model.scoring_smiles(<SMILES>, <NUM_CONFORMERS>)
 
 ## Pharmacophore Feature Extraction
 
-For deep learning researcher who want to use PharmacoNet as pre-trained model for feature extraction, we provide the script `feature_extraction.py`.
 
-```bash
-python feature_extraction.py --protein <PROTEIN_PATH> --ref_ligand <REF_LIGAND_PATH> --out <SAVE_PT_PATH>
-python feature_extraction.py --protein <PROTEIN_PATH> --center <X> <Y> <Z> --out <SAVE_PT_PATH>
-```
+***See: [`./src/pmnet_appl/`](/src/pmnet_appl/).***
 
-```bash
-OUTPUT=(multi_scale_features, hotspot_info)
-  multi_scale_features: list[torch.Tensor]:
-    - torch.Tensor [96, 4, 4, 4]
-    - torch.Tensor [96, 8, 8, 8]
-    - torch.Tensor [96, 16, 16, 16]
-    - torch.Tensor [96, 32, 32, 32]
-    - torch.Tensor [96, 64, 64, 64]
-  hotspot_infos: list[hotspot_info]
-    info: dict[str, Any]
-      - hotspot_feature: torch.Tensor (192,)
+For deep learning researchers who want to use PharmacoNet as a pre-trained model for feature extraction, we provide a Python API.
+
+```python
+from pmnet.api import PharmacoNet, get_pmnet_dev, ProteinParser
+module: PharmacoNet = get_pmnet_dev('cuda') # default: score_threshold=0.5 (lower threshold: more features)
+
+# End-to-End calculation
+pmnet_attr = module.feature_extraction(<PROTEIN_PATH>, ref_ligand_path=<REF_LIGAND_PATH>)
+pmnet_attr = module.feature_extraction(<PROTEIN_PATH>, center=(<CENTER_X>, <CENTER_Y>, <CENTER_Z>))
+
+# Step-wise calculation
+## In Dataset
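+parser = ProteinParser(center_noise=<CENTER_NOISE>) # center_noise: for data augmentation
+protein_data = parser(<PROTEIN_PATH>, ref_ligand_path=<REF_LIGAND_PATH>) # or: parser(<PROTEIN_PATH>, center=(<CENTER_X>, <CENTER_Y>, <CENTER_Z>))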
+## In Model (frozen; the method is decorated with torch.no_grad())
+pmnet_attr = module.run_extraction(protein_data)
+
+"""
+pmnet_attr = (multi_scale_features, hotspot_infos)
+- multi_scale_features: tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
+    - [96, 4, 4, 4], [96, 8, 8, 8], [96, 16, 16, 16], [96, 32, 32, 32], [96, 64, 64, 64]
+- hotspot_infos: list[hotspot_info]
+    hotspot_info: dict[str, Any]
+      - hotspot_feature: Tensor [192,]
       - hotspot_position: tuple[float, float, float] - (x, y, z)
       - hotspot_score: float in [0, 1]
-
       - nci_type: str (10 types)
           'Hydrophobic': Hydrophobic interaction
           'PiStacking_P': PiStacking (Parallel)
@@ -197,37 +205,14 @@ OUTPUT=(multi_scale_features, hotspot_info)
           'HBond_pdon': Hydrogen Bond btw Protein Donor & Ligand Acceptor
           'HBond_ldon': Hydrogen Bond btw Protein Acceptor & Ligand Donor
 
+      # Features obtained from `nci_type`, i.e. `nci_type` is all you need.
       - hotspot_type: str (7 types)
           {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
            'Halogen', 'HBond_donor', 'HBond_acceptor'}
-          *** `type` is obtained from `nci_type`.
       - point_type: str (7 types)
           {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
            'Halogen', 'HBond_donor', 'HBond_acceptor'}
-          *** `type` is obtained from `nci_type`.
-```
-
-### Python Script
-
-For feature extraction, it is recommended to use `score_threshold=0.5` instead of default setting used for pharmacophore modeling. If you want to extract more features, decrease the `score_threshold`.
-
-```python
-from pmnet.module import PharmacoNet, parse_protein
-module = PharmacoNet(
-    "cuda",
-    score_threshold = 0.5,  # <SCORE_THRESHOLD: float | dict[str, float], recommended=0.5>,
-    molvoxel_library = 'numpy' # <MOLVOXEL_LIBRARY: str, if you use it in `Dataset`, set 'numpy'>
-)
-# End-to-End calculation
-multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, <REF_LIGAND_PATH>)
-multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, center=(<X>, <Y>, <Z>))
-
-# Step-wise calculation
-voxelizer = module.voxelizer
-# In Dataset (Type: Tuple[Tensor, Tensor, Tensor, Tensor])
-protein_data = module.parse_protein(voxelizer, <PROTEIN_PATH>, <REF_LIGAND_PATH>, <CENTER_NOISE>)
-# In Model
-multi_scale_features, hotspot_infos = module.run_extraction(protein_data)
+"""
 ```
 
 ### Paper List

diff --git a/environment.yml b/environment.yml
@@ -5,5 +5,5 @@ dependencies:
   - python=3.11
   - pip=24.0
   - openbabel=3.1.1
-  - pymol-open-source=3.0.0
   - numpy=1.26.4
+  - pymol-open-source=3.0.0
diff --git a/feature_extraction.py b/feature_extraction.py
@@ -1,6 +1,6 @@
 import argparse
 import torch
-from pmnet.module import PharmacoNet
+from pmnet.api import get_pmnet_dev
 
 
 class ArgParser(argparse.ArgumentParser):
@@ -57,9 +57,7 @@ def main(args):
     ]
     """
     device = "cuda" if args.cuda else "cpu"
-    score_threshold = 0.5  # NOTE: RECOMMENDED_SCORE_THRESHOLD
-
-    module = PharmacoNet(device, score_threshold)
+    module = get_pmnet_dev(device)
     multi_scale_features, hotspot_infos = module.feature_extraction(args.protein, args.ref_ligand, args.center)
     torch.save([multi_scale_features, hotspot_infos], args.out)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pharmaconet"
-version = "2.0.3"
+version = "2.1.0"
 description = "PharmacoNet: Open-Source Software for Protein-based Pharmacophore Modeling and Virtual Screening"
 license = { text = "MIT" }
 authors = [{ name = "Seonghwan Seo", email = "shwan0106@kaist.ac.kr" }]
@@ -34,6 +34,24 @@ dependencies = [
   "biopython>=1.83"
 ]
 
+[project.optional-dependencies]
+appl = [
+  "torch==2.3.1",
+  "torch-geometric==2.4.0",
+  "torch-scatter==2.1.2",
+  "torch-sparse==0.6.18",
+  "torch-cluster==1.6.3",
+]
+dev = [
+  "torch==2.3.1",
+  "torch-geometric==2.4.0",
+  "torch-scatter==2.1.2",
+  "torch-sparse==0.6.18",
+  "torch-cluster==1.6.3",
+  "wandb",
+  "tensorboard",
+]
+
 [project.urls]
 Website = "https://github.com/SeonghwanSeo/PharmacoNet"
 "Source Code" = "https://github.com/SeonghwanSeo/PharmacoNet"

diff --git a/src/pmnet/__init__.py b/src/pmnet/__init__.py
@@ -1,6 +1,6 @@
 from .pharmacophore_model import PharmacophoreModel
 
-__version__ = "2.0.3"
+__version__ = "2.1.0"
 __citation_information__ = (
     "Seo, S., & Kim, W. Y. (2023, December). "
     "PharmacoNet: Accelerating Large-Scale Virtual Screening by Deep Pharmacophore Modeling. "

diff --git a/src/pmnet/api/__init__.py b/src/pmnet/api/__init__.py
@@ -0,0 +1,21 @@
+# NOTE: For DL Model Training
+__all__ = ["PharmacoNet", "ProteinParser", "get_pmnet_dev", "MultiScaleFeature", "HotspotInfo"]
+
+import torch
+from pmnet.module import PharmacoNet
+from pmnet.data.parser import ProteinParser
+from . import typing
+
+# Every name listed in `__all__` must be bound in this module; otherwise
+# `from pmnet.api import *` (or importing the aliases directly) fails.
+from .typing import HotspotInfo, MultiScaleFeature
+
+
+def get_pmnet_dev(
+    device: str | torch.device = "cpu",
+    score_threshold: float | dict[str, float] | None = 0.5,
+    molvoxel_library: str = "numpy",
+) -> PharmacoNet:
+    """
+    Build a `PharmacoNet` module configured for feature extraction in DL model development.
+
+    device: 'cpu' | 'cuda'
+    score_threshold: float | dict[str, float] | None
+        custom threshold to identify hotspots.
+        For feature extraction, recommended value is '0.5'
+    molvoxel_library: str
+        If you want to use PharmacoNet in DL model training, recommend to use 'numpy'
+
+    Returns the constructed `PharmacoNet` instance.
+    """
+    # NOTE(review): the third positional argument is fixed to False; its meaning is defined by
+    # `PharmacoNet.__init__`, which is not visible here -- confirm before changing it.
+    return PharmacoNet(device, score_threshold, False, molvoxel_library)
diff --git a/src/pmnet/api/typing.py b/src/pmnet/api/typing.py
@@ -0,0 +1,6 @@
+from torch import Tensor
+from typing import Any
+
+
+MultiScaleFeature = tuple[Tensor, Tensor, Tensor, Tensor, Tensor]
+HotspotInfo = dict[str, Any]
diff --git a/src/pmnet/data/constant.py b/src/pmnet/data/constant.py
@@ -1,16 +1,16 @@
 from typing import Sequence, Set
 
 INTERACTION_LIST: Sequence[str] = (
-    'Hydrophobic',
-    'PiStacking_P',
-    'PiStacking_T',
-    'PiCation_lring',
-    'PiCation_pring',
-    'HBond_ldon',
-    'HBond_pdon',
-    'SaltBridge_lneg',
-    'SaltBridge_pneg',
-    'XBond'
+    "Hydrophobic",
+    "PiStacking_P",
+    "PiStacking_T",
+    "PiCation_lring",
+    "PiCation_pring",
+    "HBond_ldon",
+    "HBond_pdon",
+    "SaltBridge_lneg",
+    "SaltBridge_pneg",
+    "XBond",
 )
 
 NUM_INTERACTION_TYPES: int = 10
@@ -28,16 +28,16 @@
 
 # PLIP Distance + 0.5 A
 INTERACTION_DIST = {
-    HYDROPHOBIC: 4.5,       # 4.0 + 0.5
-    PISTACKING_P: 6.0,      # 5.5 + 0.5
-    PISTACKING_T: 6.0,      # 5.5 + 0.5
-    PICATION_LRING: 6.5,    # 6.0 + 0.5
-    PICATION_PRING: 6.5,    # 6.0 + 0.5
-    HBOND_LDON: 4.5,        # 4.1 + 0.5 - 0.1 (to be devided to 0.5)
-    HBOND_PDON: 4.5,        # 4.1 + 0.5 - 0.1
-    SALTBRIDGE_LNEG: 6.0,   # 5.5 + 0.5
-    SALTBRIDGE_PNEG: 6.0,   # 5.5 + 0.5
-    XBOND: 4.5,             # 4.0 + 0.5
+    HYDROPHOBIC: 4.5,  # 4.0 + 0.5
+    PISTACKING_P: 6.0,  # 5.5 + 0.5
+    PISTACKING_T: 6.0,  # 5.5 + 0.5
+    PICATION_LRING: 6.5,  # 6.0 + 0.5
+    PICATION_PRING: 6.5,  # 6.0 + 0.5
+    HBOND_LDON: 4.5,  # 4.1 + 0.5 - 0.1 (to be devided to 0.5)
+    HBOND_PDON: 4.5,  # 4.1 + 0.5 - 0.1
+    SALTBRIDGE_LNEG: 6.0,  # 5.5 + 0.5
+    SALTBRIDGE_PNEG: 6.0,  # 5.5 + 0.5
+    XBOND: 4.5,  # 4.0 + 0.5
 }
 
 LONG_INTERACTION: Set[int] = {
@@ -46,7 +46,7 @@
     PICATION_PRING,
     PICATION_LRING,
     SALTBRIDGE_LNEG,
-    SALTBRIDGE_PNEG
+    SALTBRIDGE_PNEG,
 }
 
 SHORT_INTERACTION: Set[int] = {

diff --git a/src/pmnet/data/parser.py b/src/pmnet/data/parser.py
@@ -0,0 +1,101 @@
+import os
+import tempfile
+from pathlib import Path
+
+import torch
+import numpy as np
+from openbabel import pybel
+
+from pmnet.data import token_inference, pointcloud
+from pmnet.data.objects import Protein
+from pmnet.data.extract_pocket import extract_pocket
+
+from molvoxel import create_voxelizer, BaseVoxelizer
+from torch import Tensor
+from numpy.typing import NDArray
+
+
+class ProteinParser:
+    """Callable that turns a protein PDB file into the tensors produced by `parse_protein`.
+
+    The pocket center is either given explicitly or computed as the mean atom
+    position of a reference ligand file.
+    """
+
+    def __init__(self, center_noise: float = 0.0, pocket_extract: bool = True, molvoxel_library: str = "numpy"):
+        """
+        center_noise: for data augmentation
+            (forwarded to `parse_protein`, which randomly shifts the center when it is positive)
+        pocket_extract: if True, we read pocket instead of entire protein. (faster)
+        molvoxel_library: forwarded as `library` to `molvoxel.create_voxelizer`
+        """
+        self.voxelizer = create_voxelizer(0.5, 64, sigma=1 / 3, library=molvoxel_library)
+        self.noise: float = center_noise
+        self.extract: bool = pocket_extract
+
+        # NOTE(review): this sets the output level on a newly created handler object; confirm
+        # that it also silences the global `pybel.ob.obErrorLog`, which is presumably the intent.
+        ob_log_handler = pybel.ob.OBMessageHandler()
+        ob_log_handler.SetOutputLevel(0)  # 0: None
+
+    def __call__(
+        self,
+        protein_pdb_path: str | Path,
+        ref_ligand_path: str | Path | None = None,
+        center: NDArray[np.float32] | tuple[float, float, float] | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+        """Shorthand for `parse`; takes the same arguments and returns the same tensors."""
+        return self.parse(protein_pdb_path, ref_ligand_path, center)
+
+    def parse(
+        self,
+        protein_pdb_path: str | Path,
+        ref_ligand_path: str | Path | None = None,
+        center: NDArray[np.float32] | tuple[float, float, float] | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+        """Resolve the pocket center and run `parse_protein` with this parser's settings.
+
+        At least one of `ref_ligand_path` and `center` must be given; `center` takes
+        precedence when both are provided.
+        """
+        assert (ref_ligand_path is not None) or (
+            center is not None
+        ), "either `ref_ligand_path` or `center` is required"
+        _center = self.get_center(ref_ligand_path, center)
+        return parse_protein(self.voxelizer, protein_pdb_path, _center, self.noise, self.extract)
+
+    @staticmethod
+    def get_center(
+        ref_ligand_path: str | Path | None = None,
+        center: tuple[float, float, float] | NDArray | None = None,
+    ) -> tuple[float, float, float]:
+        """Return the pocket center as a tuple of three Python floats.
+
+        An explicit `center` is used as-is. Otherwise the first molecule of
+        `ref_ligand_path` (.sdf, .pdb or .mol2) is read with pybel and the mean of its
+        atom coordinates is returned.
+        """
+        if center is not None:
+            assert len(center) == 3, "`center` must contain exactly three coordinates (x, y, z)"
+            x, y, z = center
+        else:
+            assert ref_ligand_path is not None, "either `ref_ligand_path` or `center` is required"
+            # Compare the extension case-insensitively so that e.g. 'ligand.SDF' is accepted.
+            extension = os.path.splitext(ref_ligand_path)[1].lower()
+            assert extension in [".sdf", ".pdb", ".mol2"], f"unsupported ligand file extension: '{extension}'"
+            # Only the first molecule stored in the file is used.
+            ref_ligand = next(pybel.readfile(extension[1:], str(ref_ligand_path)))
+            x, y, z = np.mean([atom.coords for atom in ref_ligand.atoms], axis=0, dtype=np.float32).tolist()
+        return float(x), float(y), float(z)
+
+
+def parse_protein(
+    voxelizer: BaseVoxelizer,
+    protein_pdb_path: str | Path,
+    center: NDArray[np.float32] | tuple[float, float, float],
+    center_noise: float = 0.0,
+    pocket_extract: bool = True,
+) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+    """Voxelize a protein around `center` and collect its token information.
+
+    voxelizer: molvoxel voxelizer used to build the protein image and the mask
+    protein_pdb_path: path of the protein PDB file
+    center: (x, y, z) center of the region of interest
+    center_noise: if positive, each coordinate of the center is shifted by a uniform
+        random offset in [-center_noise, center_noise)
+    pocket_extract: if True, a pocket PDB is first written with `extract_pocket` and
+        parsed instead of the entire protein
+
+    Returns (protein_image [float], mask [bool], token_positions [float], tokens [long]).
+    """
+    # Accept any coordinate sequence (tuple, list, ...); arrays are kept untouched.
+    if not isinstance(center, np.ndarray):
+        center = np.array(center, dtype=np.float32)
+    if center_noise > 0:
+        center = center + (np.random.rand(3) * 2 - 1) * center_noise
+
+    protein_obj: Protein
+    if pocket_extract:
+        # The pocket file is only needed while it is being parsed, so it lives in a temp dir.
+        with tempfile.TemporaryDirectory() as dirname:
+            pocket_path = os.path.join(dirname, "pocket.pdb")
+            extract_pocket(protein_pdb_path, pocket_path, center)
+            protein_obj = Protein.from_pdbfile(pocket_path)
+    else:
+        protein_obj = Protein.from_pdbfile(protein_pdb_path)
+
+    token_positions, token_classes = token_inference.get_token_informations(protein_obj)
+    # `token_filter` indexes `token_positions`; presumably it keeps the tokens retained
+    # around `center` -- confirm against `token_inference.get_token_and_filter`.
+    tokens, token_filter = token_inference.get_token_and_filter(token_positions, token_classes, center)
+    token_positions = token_positions[token_filter]
+
+    protein_positions, protein_features = pointcloud.get_protein_pointcloud(protein_obj)
+    protein_image = np.asarray(
+        voxelizer.forward_features(protein_positions, center, protein_features, radii=1.5), np.float32
+    )
+    # The mask is the logical negation of the radii=1.0 voxelization of the protein atoms.
+    mask = np.logical_not(np.asarray(voxelizer.forward_single(protein_positions, center, radii=1.0), np.bool_))
+    del protein_obj
+    return (
+        torch.from_numpy(protein_image).to(torch.float),
+        torch.from_numpy(mask).to(torch.bool),
+        torch.from_numpy(token_positions).to(torch.float),
+        torch.from_numpy(tokens).to(torch.long),
+    )