Merge pull request #7 from SeonghwanSeo/dev-2.0.1

Dev 2.0.1
SeonghwanSeo · Jul 12, 2024 · e61d36f · e61d36f
2 parents dc7e0a9 + 8e9ba19
commit e61d36f
Show file tree

Hide file tree

Showing 48 changed files with 338 additions and 355 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,6 @@ weights
 run.sh
 result/
 examples/library/
-pyproject.toml
 
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -43,17 +43,19 @@ For various environment including Linux, MacOS and Window, the script installs *
 
 ```bash
 conda env create -f environment.yml
-conda activate openph
+conda activate pmnet
+pip install torch # 1.13<=torch<=2.3.1, CUDA acceleration is available. 1min for 1 cpu, 10s for 1 gpu
+pip install .
 ```
 
 #### Manual Installation
 
 ```shell
 # Required python>=3.9, Best Performance at higher version. (3.9, 3.10, 3.11, 3.12 - best)
-conda create --name openph python=3.10 openbabel=3.1.1 pymol-open-source=3.0.0 numpy=1.26
-conda activate openph
+conda create --name pmnet python=3.10 openbabel=3.1.1 pymol-open-source=3.0.0 numpy=1.26.4
+conda activate pmnet
 
-pip install torch # torch >= 1.13, CUDA acceleration is available. 1min for 1 cpu, 10s for 1 gpu
+pip install torch # 1.13<=torch<=2.3.1, CUDA acceleration is available. 1min for 1 cpu, 10s for 1 gpu
 pip install rdkit biopython omegaconf tqdm numba # Numba is optional, but recommended.
 pip install molvoxel # Molecular voxelization tools with minimal dependencies (https://github.com/SeonghwanSeo/molvoxel.git)
 ```
@@ -165,33 +167,44 @@ score = model.scoring_smiles(<SMILES>, <NUM_CONFORMERS>)
 For deep learning researchers who want to use PharmacoNet as a pre-trained model for feature extraction, we provide the script `feature_extraction.py`.
 
 ```bash
-python feature_extraction.py --protein <PROTEIN_PATH> --ref_ligand <REF_LIGAND_PATH> --out <SAVE_PKL_PATH>
-python feature_extraction.py --protein <PROTEIN_PATH> --center <X> <Y> <Z> --out <SAVE_PKL_PATH>
+python feature_extraction.py --protein <PROTEIN_PATH> --ref_ligand <REF_LIGAND_PATH> --out <SAVE_PT_PATH>
+python feature_extraction.py --protein <PROTEIN_PATH> --center <X> <Y> <Z> --out <SAVE_PT_PATH>
 ```
 
 ```bash
-PHARMACOPHORE NODE FEATURE LIST: List[Dict[str, Any]]
-    PHARMACOPHORE NODE FEATURE: Dict[str, Any]
-        - feature: NDArray[np.float32]
-        - type: str (7 types)
-            {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
-             'Halogen', 'HBond_donor', 'HBond_acceptor'}
-            *** `type` is obtained from `nci_type`.
-        - nci_type: str (10 types)
-            'Hydrophobic': Hydrophobic interaction
-            'PiStacking_P': Pi-Pi Stacking (Parallel)
-            'PiStacking_T': Pi-Pi Stacking (T-shaped)
-            'PiCation_lring': Cation-Pi Interaction btw Protein Cation & Ligand Aromatic Ring
-            'PiCation_pring': Cation-Pi Interaction btw Protein Aromatic Ring & Ligand Cation
-            'SaltBridge_pneg': SaltBridge btw Protein Anion & Ligand Cation
-            'SaltBridge_lneg': SaltBridge btw Protein Cation & Ligand Anion
-            'HBond_pdon': Hydrogen Bond btw Protein Donor & Ligand Acceptor
-            'HBond_ldon': Hydrogen Bond btw Protein Acceptor & Ligand Donor
-            'XBond': Halogen Bond
-        - priority_score: float in [0, 1]
-        - hotspot_position: tuple[float, float, float] - (x, y, z)
-        - center: tuple[float, float, float] - (x, y, z)
-        - radius: float
+OUTPUT=(multi_scale_features, hotspot_infos)
+  multi_scale_features: list[torch.Tensor]:
+    - torch.Tensor [96, 4, 4, 4]
+    - torch.Tensor [96, 8, 8, 8]
+    - torch.Tensor [96, 16, 16, 16]
+    - torch.Tensor [96, 32, 32, 32]
+    - torch.Tensor [96, 64, 64, 64]
+  hotspot_infos: list[hotspot_info]
+    hotspot_info: dict[str, Any]
+      - hotspot_feature: torch.Tensor (192,)
+      - hotspot_position: tuple[float, float, float] - (x, y, z)
+      - hotspot_score: float in [0, 1]
+
+      - nci_type: str (10 types)
+          'Hydrophobic': Hydrophobic interaction
+          'PiStacking_P': PiStacking (Parallel)
+          'PiStacking_T': PiStacking (T-shaped)
+          'PiCation_lring': Interaction btw Protein Cation & Ligand Aromatic Ring
+          'PiCation_pring': Interaction btw Protein Aromatic Ring & Ligand Cation
+          'SaltBridge_pneg': SaltBridge btw Protein Anion & Ligand Cation
+          'SaltBridge_lneg': SaltBridge btw Protein Cation & Ligand Anion
+          'XBond': Halogen Bond
+          'HBond_pdon': Hydrogen Bond btw Protein Donor & Ligand Acceptor
+          'HBond_ldon': Hydrogen Bond btw Protein Acceptor & Ligand Donor
+
+      - hotspot_type: str (7 types)
+          {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
+           'Halogen', 'HBond_donor', 'HBond_acceptor'}
+          *** `hotspot_type` is obtained from `nci_type`.
+      - point_type: str (7 types)
+          {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
+           'Halogen', 'HBond_donor', 'HBond_acceptor'}
+          *** `point_type` is obtained from `nci_type`.
 ```
 
 ### Python Script
@@ -200,13 +213,12 @@ For feature extraction, it is recommended to use `score_threshold=0.5` instead o
 
 ```python
 from pmnet.module import PharmacoNet
-
 module = PharmacoNet(
     "cuda",
     score_threshold = 0.5  # <SCORE_THRESHOLD: float | dict[str, float], recommended=0.5>,
 )
-
-pharmacophore_node_feature_list = module.feature_extraction(<PROTEIN_PATH>, center=(<X>, <Y>, <Z>))
+multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, <REF_LIGAND_PATH>)
+multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, center=(<X>, <Y>, <Z>))
 ```
 
 ### Paper List
@@ -226,3 +238,4 @@ Paper on [arxiv](https://arxiv.org/abs/2310.00681)
   url = {https://arxiv.org/abs/2310.00681},
 }
 ```
+
diff --git a/environment.yml b/environment.yml
@@ -1,17 +1,16 @@
-name: openph
+name: pmnet
 channels:
   - conda-forge
 dependencies:
   - python=3.11
   - pip=24.0
   - openbabel=3.1.1
   - pymol-open-source=3.0.0
-  - numpy=1.26
+  - numpy=1.26.4
   - pip:
       - tqdm
-      - torch==1.13.1
       - molvoxel==0.1.3
-      - numba==0.59
+      - numba==0.59.1
       - omegaconf==2.3.0
       - gdown==5.1.0
       - biopython==1.83
diff --git a/feature_extraction.py b/feature_extraction.py
@@ -1,15 +1,37 @@
 import argparse
-import pickle
+import torch
 from pmnet.module import PharmacoNet
 
 
+class ArgParser(argparse.ArgumentParser):
+    def __init__(self):
+        super().__init__("PharmacoNet Feature Extraction Script")
+        self.formatter_class = argparse.ArgumentDefaultsHelpFormatter
+        self.add_argument(
+            "-p",
+            "--protein",
+            type=str,
+            help="custom path of protein pdb file (.pdb)",
+            required=True,
+        )
+        self.add_argument("--out", type=str, help="save path of features (torch object)", required=True)
+        self.add_argument(
+            "--ref_ligand",
+            type=str,
+            help="path of ligand to define the center of box (.sdf, .pdb, .mol2)",
+        )
+        self.add_argument("--center", nargs="+", type=float, help="coordinate of the center")
+        self.add_argument("--cuda", action="store_true", help="use gpu acceleration with CUDA")
+
+
 """
-PHARMACOPHORE_POINT_FEATURE_LIST: list[dict[str, Any]]
-    PHARMACOPHORE_POINT_FEATURE
-        - type: str (7 types)
-            {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
-             'Halogen', 'HBond_donor', 'HBond_acceptor'}
-            *** `type` is obtained from `nci_type`.
+return tuple[multi_scale_features, hotspot_infos]
+    multi_scale_features: list[torch.Tensor]:
+        - [96, 4, 4, 4], [96, 8, 8, 8], [96, 16, 16, 16], [96, 32, 32, 32], [96, 64, 64, 64]
+    hotspot_infos: list[hotspot_info], where each hotspot_info is a dict[str, Any] with
+        - hotspot_feature: torch.Tensor (192,)
+        - hotspot_position: tuple[float, float, float] - (x, y, z)
+        - hotspot_score: float in [0, 1]
 
         - nci_type: str (10 types)
             'Hydrophobic': Hydrophobic interaction
@@ -23,72 +45,28 @@
             'HBond_pdon': Hydrogen Bond btw Protein Donor & Ligand Acceptor
             'HBond_ldon': Hydrogen Bond btw Protein Acceptor & Ligand Donor
 
-        - hotspot_position: tuple[float, float, float] - (x, y, z)
-        - priority_score: str in [0, 1]
-        - center: tuple[float, float, float] - (x, y, z) 
-        - radius: float
-        - feature: NDArray[np.float32]
+        - hotspot_type: str (7 types)
+            {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
+             'Halogen', 'HBond_donor', 'HBond_acceptor'}
+            *** `hotspot_type` is obtained from `nci_type`.
+        - point_type: str (7 types)
+            {'Hydrophobic', 'Aromatic', 'Cation', 'Anion',
+             'Halogen', 'HBond_donor', 'HBond_acceptor'}
+            *** `point_type` is obtained from `nci_type`.
+]
 """
 
 
-# NOTE: UNUSED IN THIS SCRIPT
-DEFAULT_MODELING_SCORE_THRESHOLD = {
-    "PiStacking_P": 0.7,  # Top 30%
-    "PiStacking_T": 0.7,
-    "SaltBridge_lneg": 0.7,
-    "SaltBridge_pneg": 0.7,
-    "PiCation_lring": 0.7,
-    "PiCation_pring": 0.7,
-    "XBond": 0.85,  # Top 15%
-    "HBond_ldon": 0.85,
-    "HBond_pdon": 0.85,
-    "Hydrophobic": 0.85,
-}
-
-
 # NOTE: RECOMMENDED
 RECOMMENDED_SCORE_THRESHOLD = 0.5
 
 
-class ArgParser(argparse.ArgumentParser):
-    def __init__(self):
-        super().__init__("PharmacoNet Feature Extraction Script")
-        self.formatter_class = argparse.ArgumentDefaultsHelpFormatter
-        self.add_argument(
-            "-p",
-            "--protein",
-            type=str,
-            help="custom path of protein pdb file (.pdb)",
-            required=True,
-        )
-        self.add_argument(
-            "--out", type=str, help="save path of features (.pkl)", required=True
-        )
-        self.add_argument(
-            "--ref_ligand",
-            type=str,
-            help="path of ligand to define the center of box (.sdf, .pdb, .mol2)",
-        )
-        self.add_argument(
-            "--center", nargs="+", type=float, help="coordinate of the center"
-        )
-        self.add_argument(
-            "--cuda", action="store_true", help="use gpu acceleration with CUDA"
-        )
-
-
 if __name__ == "__main__":
     parser = ArgParser()
     args = parser.parse_args()
     module = PharmacoNet(
         device="cuda" if args.cuda else "cpu",
         score_threshold=RECOMMENDED_SCORE_THRESHOLD,
     )
-    pharmacophore_point_feature_list = module.feature_extraction(
-        args.protein, args.ref_ligand, args.center
-    )
-    for key, item in pharmacophore_point_feature_list[0].items():
-        print(key)
-        print(type(item))
-    with open(args.out, "wb") as w:
-        pickle.dump(pharmacophore_point_feature_list, w)
+    multi_scale_features, hotspot_infos = module.feature_extraction(args.protein, args.ref_ligand, args.center)
+    torch.save([multi_scale_features, hotspot_infos], args.out)
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,67 @@
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pharmaconet"
+version = "2.0.1"
+description = "PharmacoNet: Open-Source Software for Protein-based Pharmacophore Modeling and Virtual Screening"
+license = { text = "MIT" }
+authors = [{ name = "Seonghwan Seo", email = "shwan0106@kaist.ac.kr" }]
+requires-python = ">=3.10"
+classifiers = [
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Development Status :: 4 - Beta",
+  "Operating System :: OS Independent",
+  "License :: OSI Approved :: MIT License",
+  "Topic :: Scientific/Engineering",
+  "Topic :: Scientific/Engineering :: Bio-Informatics",
+  "Topic :: Scientific/Engineering :: Chemistry",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12"
+]
+
+dependencies = [
+  "torch>=1.13.0",
+  "numpy==1.26.4",
+  "numba==0.59.1",
+  "omegaconf>=2.3.0",
+  "molvoxel==0.1.3",
+  "gdown==5.1.0",
+  "biopython==1.83"
+]
+
+[project.urls]
+Website = "https://github.com/SeonghwanSeo/PharmacoNet"
+"Source Code" = "https://github.com/SeonghwanSeo/PharmacoNet"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+# CODING
+[tool.ruff]
+target-version = "py310"
+line-length = 120
+
+[tool.ruff.lint]
+select = ["E", "F", "B", "UP", "T203",]
+ignore = ["E501"]
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = [
+    "F401", # imported but unused
+    "E402", # Module level import not at top of file
+]
+
+[tool.pyright]
+pythonVersion = "3.10"
+typeCheckingMode = "standard"
+diagnosticMode = "openFilesOnly"
+reportImplicitStringConcatenation = false
+reportGeneralTypeIssues = "warning"
+reportDeprecated = "warning"
+reportUnusedVariable = false
+reportUnusedImport = false
+
diff --git a/screening.py b/screening.py
@@ -33,7 +33,7 @@ def __init__(self):
 
 
 def func(file, model, weight):
-    return file.stem, model.scoring_file(file, weight)
+    return file, model.scoring_file(file, weight)
 
 
 if __name__ == "__main__":
@@ -51,8 +51,8 @@ def func(file, model, weight):
     )
     library_path = Path(args.library_dir)
     file_list = list(library_path.rglob("*.sdf")) + list(library_path.rglob("*.mol2"))
+    print(f"find {len(file_list)} molecules")
     f = partial(func, model=model, weight=weight)
-
     with multiprocessing.Pool(args.cpus) as pool:
         result = pool.map(f, file_list)
 

diff --git a/pmnet/__init__.py → src/pmnet/__init__.py b/pmnet/__init__.py → src/pmnet/__init__.py
@@ -1,6 +1,6 @@
 from .pharmacophore_model import PharmacophoreModel
 
-__version__ = '1.0.0'
+__version__ = "2.0.1"
 __citation_information__ = (
     "Seo, S., & Kim, W. Y. (2023, December). "
     "PharmacoNet: Accelerating Large-Scale Virtual Screening by Deep Pharmacophore Modeling. "

diff --git a/pmnet/data/__init__.py → src/pmnet/data/__init__.py b/pmnet/data/__init__.py → src/pmnet/data/__init__.py
diff --git a/pmnet/data/constant.py → src/pmnet/data/constant.py b/pmnet/data/constant.py → src/pmnet/data/constant.py
diff --git a/pmnet/data/extract_pocket.py → src/pmnet/data/extract_pocket.py b/pmnet/data/extract_pocket.py → src/pmnet/data/extract_pocket.py
diff --git a/pmnet/data/objects/__init__.py → src/pmnet/data/objects/__init__.py b/pmnet/data/objects/__init__.py → src/pmnet/data/objects/__init__.py
diff --git a/pmnet/data/objects/atom_classes.py → src/pmnet/data/objects/atom_classes.py b/pmnet/data/objects/atom_classes.py → src/pmnet/data/objects/atom_classes.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ weights @@
     run.sh
     result/
     examples/library/
-    pyproject.toml
     # Byte-compiled / optimized / DLL files
@@ Expand Down @@