update

SeonghwanSeo · Aug 7, 2024 · 59fcba4
1 parent 3adba49
commit 59fcba4
Show file tree

Hide file tree

Showing 11 changed files with 314 additions and 331 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,8 @@ weights
 run.sh
 result/
 examples/library/
+nogit/
+test.sh
 
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -212,13 +212,22 @@ OUTPUT=(multi_scale_features, hotspot_info)
 For feature extraction, it is recommended to use `score_threshold=0.5` instead of the default setting used for pharmacophore modeling. If you want to extract more features, decrease the `score_threshold`.
 
 ```python
-from pmnet.module import PharmacoNet
+from pmnet.module import PharmacoNet, parse_protein
 module = PharmacoNet(
     "cuda",
-    score_threshold = 0.5  # <SCORE_THRESHOLD: float | dict[str, float], recommended=0.5>,
+    score_threshold = 0.5,  # <SCORE_THRESHOLD: float | dict[str, float], recommended=0.5>,
+    molvoxel_library = 'numpy' # <MOLVOXEL_LIBRARY: str, if you use it in `Dataset`, set 'numpy'>
 )
+# End-to-End calculation
 multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, <REF_LIGAND_PATH>)
 multi_scale_features, hotspot_infos = module.feature_extraction(<PROTEIN_PATH>, center=(<X>, <Y>, <Z>))
+
+# Step-wise calculation
+voxelizer = module.voxelizer
+# In Dataset (Type: Tuple[Tensor, Tensor, Tensor, Tensor])
+protein_data = module.parse_protein(voxelizer, <PROTEIN_PATH>, <REF_LIGAND_PATH>, <CENTER_NOISE>)
+# In Model
+multi_scale_features, hotspot_infos = module.run_extraction(protein_data)
 ```
 
 ### Paper List

diff --git a/environment.yml b/environment.yml
@@ -7,10 +7,3 @@ dependencies:
   - openbabel=3.1.1
   - pymol-open-source=3.0.0
   - numpy=1.26.4
-  - pip:
-      - tqdm
-      - molvoxel==0.1.3
-      - numba==0.59.1
-      - omegaconf==2.3.0
-      - gdown==5.1.0
-      - biopython==1.83
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pharmaconet"
-version = "2.0.1"
+version = "2.0.2"
 description = "PharmacoNet: Open-Source Software for Protein-based Pharmacophore Modeling and Virtual Screening"
 license = { text = "MIT" }
 authors = [{ name = "Seonghwan Seo", email = "shwan0106@kaist.ac.kr" }]
@@ -24,13 +24,14 @@ classifiers = [
 ]
 
 dependencies = [
+  "tqdm",
   "torch>=1.13.0",
-  "numpy==1.26.4",
-  "numba==0.59.1",
+  "numpy>=1.26,<1.27",
+  "numba>=0.59",
   "omegaconf>=2.3.0",
-  "molvoxel==0.1.3",
-  "gdown==5.1.0",
-  "biopython==1.83"
+  "molvoxel>=0.1.3",
+  "gdown>=5.1.0",
+  "biopython>=1.83"
 ]
 
 [project.urls]

diff --git a/src/pmnet/__init__.py b/src/pmnet/__init__.py
@@ -1,6 +1,6 @@
 from .pharmacophore_model import PharmacophoreModel
 
-__version__ = "2.0.1"
+__version__ = "2.0.2"
 __citation_information__ = (
     "Seo, S., & Kim, W. Y. (2023, December). "
     "PharmacoNet: Accelerating Large-Scale Virtual Screening by Deep Pharmacophore Modeling. "

diff --git a/src/pmnet/data/extract_pocket.py b/src/pmnet/data/extract_pocket.py
@@ -1,20 +1,63 @@
 import os
 import numpy as np
+import math
 
 from Bio.PDB import PDBParser, PDBIO
 from Bio.PDB.PDBIO import Select
 
+from typing import Union
 from numpy.typing import ArrayLike
+from pathlib import Path
 
 import warnings
+
 warnings.filterwarnings("ignore")
 
 AMINO_ACID = [
-    'GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'TYR', 'TRP', 'SER',
-    'THR', 'CYS', 'MET', 'ASN', 'GLN', 'ASP', 'GLU', 'LYS', 'ARG', 'HIS',
-    'HIP', 'HIE', 'TPO', 'HID', 'LEV', 'MEU', 'PTR', 'GLV', 'CYT', 'SEP',
-    'HIZ', 'CYM', 'GLM', 'ASQ', 'TYS', 'CYX', 'GLZ', 'MSE', 'CSO', 'KCX',
-    'CSD', 'MLY', 'PCA', 'LLP'
+    "GLY",
+    "ALA",
+    "VAL",
+    "LEU",
+    "ILE",
+    "PRO",
+    "PHE",
+    "TYR",
+    "TRP",
+    "SER",
+    "THR",
+    "CYS",
+    "MET",
+    "ASN",
+    "GLN",
+    "ASP",
+    "GLU",
+    "LYS",
+    "ARG",
+    "HIS",
+    "HIP",
+    "HIE",
+    "TPO",
+    "HID",
+    "LEV",
+    "MEU",
+    "PTR",
+    "GLV",
+    "CYT",
+    "SEP",
+    "HIZ",
+    "CYM",
+    "GLM",
+    "ASQ",
+    "TYS",
+    "CYX",
+    "GLZ",
+    "MSE",
+    "CSO",
+    "KCX",
+    "CSD",
+    "MLY",
+    "PCA",
+    "LLP",
 ]
 
 
@@ -28,11 +71,9 @@ def accept_residue(self, residue):
             return 0
         if residue.get_resname() not in AMINO_ACID:
             return 0
-        residue_positions = np.array([
-            list(atom.get_vector())
-            for atom in residue.get_atoms()
-            if "H" not in atom.get_id()
-        ])
+        residue_positions = np.array(
+            [list(atom.get_vector()) for atom in residue.get_atoms() if "H" not in atom.get_id()]
+        )
         if residue_positions.shape[0] == 0:
             return 0
         min_dis = np.min(np.linalg.norm(residue_positions - self.center, axis=-1))
@@ -42,14 +83,14 @@ def accept_residue(self, residue):
             return 0
 
 
+DEFAULT_CUTOFF = 16 * math.sqrt(3) + 5.0
+
+
 def extract_pocket(
-    protein_pdb_path: str,
-    out_pocket_pdb_path: str,
-    center: ArrayLike,
-    cutoff: float
+    protein_pdb_path: Union[str, Path], out_pocket_pdb_path: str, center: ArrayLike, cutoff: float = DEFAULT_CUTOFF
 ):
     parser = PDBParser()
-    structure = parser.get_structure("protein", protein_pdb_path)
+    structure = parser.get_structure("protein", str(protein_pdb_path))
     io = PDBIO()
     io.set_structure(structure)
     io.save(out_pocket_pdb_path, DistSelect(center, cutoff))

diff --git a/src/pmnet/data/token_inference.py b/src/pmnet/data/token_inference.py
@@ -8,9 +8,7 @@
 from . import constant as C
 
 
-def get_token_informations(
-    protein_obj: Protein,
-) -> Tuple[NDArray[np.float32], NDArray[np.int16]]:
+def get_token_informations(protein_obj: Protein) -> Tuple[NDArray[np.float32], NDArray[np.int16]]:
     """get token information
 
     Args:
@@ -20,14 +18,15 @@ def get_token_informations(
         token_positions: [float, (N, 3)] token center positions
         token_classes: [int, (N,)] token interaction type
     """
-    num_tokens = \
-        len(protein_obj.hydrophobic_atoms_all) + \
-        len(protein_obj.rings_all) * 3 + \
-        len(protein_obj.hbond_donors_all) + \
-        len(protein_obj.hbond_acceptors_all) + \
-        len(protein_obj.pos_charged_atoms_all) * 2 + \
-        len(protein_obj.neg_charged_atoms_all) + \
-        len(protein_obj.xbond_acceptors_all)
+    num_tokens = (
+        len(protein_obj.hydrophobic_atoms_all)
+        + len(protein_obj.rings_all) * 3
+        + len(protein_obj.hbond_donors_all)
+        + len(protein_obj.hbond_acceptors_all)
+        + len(protein_obj.pos_charged_atoms_all) * 2
+        + len(protein_obj.neg_charged_atoms_all)
+        + len(protein_obj.xbond_acceptors_all)
+    )
 
     positions: List[Tuple[float, float, float]] = []
     classes: List[int] = []
@@ -83,8 +82,6 @@ def get_token_and_filter(
     positions: NDArray[np.float32],
     classes: NDArray[np.int16],
     center: NDArray[np.float32],
-    resolution: float,
-    dimension: int,
 ) -> Tuple[NDArray[np.int16], NDArray[np.int16]]:
     """Create token and Filtering valid instances
 
@@ -99,6 +96,7 @@ def get_token_and_filter(
         token: [int, (N_token, 4)]
         filter: [int, (N_token,)]
     """
+    resolution, dimension = 0.5, 64
     filter = []
     tokens = []
     x_center, y_center, z_center = center
@@ -116,12 +114,7 @@ def get_token_and_filter(
     return np.array(tokens, dtype=np.int16), np.array(filter, dtype=np.int16)
 
 
-def get_box_area(
-    tokens: ArrayLike,
-    pharmacophore_size: float,
-    resolution: float,
-    dimension: int,
-) -> NDArray[np.bool_]:
+def get_box_area(tokens: ArrayLike) -> NDArray[np.bool_]:
     """Create Box Area
 
     Args:
@@ -132,9 +125,10 @@ def get_box_area(
     Returns:
         box_areas: BoolArray [Ntoken, D, H, W] D=H=W=dimension
     """
+    resolution, dimension, pharmacophore_size = 0.5, 64, 1.0
     num_tokens = len(tokens)
     box_areas = np.zeros((num_tokens, dimension, dimension, dimension), dtype=np.bool_)
-    grids = np.stack(np.meshgrid(np.arange(dimension), np.arange(dimension), np.arange(dimension), indexing='ij'), 3)
+    grids = np.stack(np.meshgrid(np.arange(dimension), np.arange(dimension), np.arange(dimension), indexing="ij"), 3)
     for i, (x, y, z, t) in enumerate(tokens):
         x, y, z, t = int(x), int(y), int(z), int(t)
         distances = np.linalg.norm(grids - np.array([[x, y, z]]), axis=-1)
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,8 @@ weights @@
     run.sh
     result/
     examples/library/
+    nogit/
+    test.sh
     # Byte-compiled / optimized / DLL files
@@ Expand Down @@