diff --git a/README.md b/README.md index 701e059..7ba4ae5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ further validation. ---------------------------- 1. Setting up dependencies - python ~=3.9, perl ~=5.32 + python ~=3.9, perl ~=5.32 ```bash conda create --name secse -c conda-forge parallel tqdm biopandas openbabel chemprop xlrd=2 pandarallel rdkit=2022.09 conda activate secse @@ -114,6 +114,10 @@ further validation. - _spiro_site_count_, maximum of spiro ring site count, default=1, type=int - _fused_site_count_, maximum of fused ring site count, default=3, type=int - _rdkit_sa_score_, synthetic accessibility score (calculated by RDKit) cutoff, default=5, type=float + - _substructure_filter_, path to a file containing customized unwanted substructure SMARTS in "*.xls" format; set the + value to 0 if you do not have any additional unwanted substructures. PAINS patterns are already included by default. The file + should include columns for **`Pattern`**, **`ID`**, and **`Max`**, where the **`ID`** should be unique for each SMARTS. You can + refer to the example file [subtructure_filter_demo.xls](demo/subtructure_filter_demo.xls), default=0, type=string Config file of a demo case [phgdh_demo_vina.ini](demo/phgdh_demo_vina.ini) Customized rule json template [rules.json](demo/rules.json). Rule ID should be in the form G-001-XXXX, like @@ -139,7 +143,8 @@ GNU Parallel installation python ~=3.9, perl ~=5.32 -numpy~=1.24.3, pandas~=1.3.3, xlrd~=2.0.1, pandarallel~=1.5.2, tqdm~=4.65.0, biopandas~=0.4.1, openbabel~=3.1.1, rdkit~=2022.09, chemprop~=1.5.2, pytorch~=2.0.0+cu117 +numpy~=1.24.3, pandas~=1.3.3, xlrd~=2.0.1, pandarallel~=1.5.2, tqdm~=4.65.0, biopandas~=0.4.1, openbabel~=3.1.1, +rdkit~=2022.09, chemprop~=1.5.2, pytorch~=2.0.0+cu117 Linux server with CPUs only also works. 
diff --git a/demo/phgdh_demo_vina.ini b/demo/phgdh_demo_vina.ini index 02f35d0..8e9d039 100644 --- a/demo/phgdh_demo_vina.ini +++ b/demo/phgdh_demo_vina.ini @@ -48,4 +48,5 @@ ring_system_count = 4 bridged_site_count = 2 spiro_site_count = 1 fused_site_count = 3 -rdkit_sa_score = 5 \ No newline at end of file +rdkit_sa_score = 5 +substructure_filter = 0 \ No newline at end of file diff --git a/demo/subtructure_filter_demo.xls b/demo/subtructure_filter_demo.xls new file mode 100755 index 0000000..72ebc09 Binary files /dev/null and b/demo/subtructure_filter_demo.xls differ diff --git a/secse/growing/filter.py b/secse/growing/filter.py index 59d96eb..ec79ce0 100755 --- a/secse/growing/filter.py +++ b/secse/growing/filter.py @@ -33,10 +33,17 @@ def __init__(self, gen, config_path): self.input_smiles = None self.mol = None self.pains_smarts = None - self.strutFilter = StructureFilter() config = configparser.ConfigParser() config.read(config_path) + + substructure_filter_file = config.get("properties", "substructure_filter") + if substructure_filter_file == "0": + self.strutFilter = StructureFilter() + else: + # print("Use additional substructure filter patters.") + self.strutFilter = StructureFilter(substructure_filter_file) + self.MW = config.getfloat("properties", "MW") self.logP_lower = config.getfloat("properties", "logP_lower") self.logP_upper = config.getfloat("properties", "logP_upper") @@ -143,19 +150,7 @@ def alert_filter(self): yield "PAINS" yield "PASS" - def element_filter(self): - f_count = self.input_smiles.count("F") - br_count = self.input_smiles.count("Br") - cl_count = self.input_smiles.count("Cl") - i_count = self.input_smiles.count("I") - s_count = self.input_smiles.count("S") + self.input_smiles.count("s") - p_count = self.input_smiles.count("P") - if not all([f_count <= 5, br_count <= 2, cl_count <= 3, i_count <= 1, s_count <= 2, p_count <= 1]): - yield "element" - yield "PASS" - def substructure_filter(self): - # self.element_filter() yield 
self.strutFilter.sfilter(self.mol) def ring_system_filter(self): diff --git a/secse/utilities/substructure_filter.py b/secse/utilities/substructure_filter.py index fc98543..bb2f5d4 100755 --- a/secse/utilities/substructure_filter.py +++ b/secse/utilities/substructure_filter.py @@ -9,13 +9,13 @@ import pandas as pd from rdkit import Chem -FILTER_FILE = "Structure Filter_20211015_v1.12.xls" +FILTER_FILE = os.path.join(os.getenv("SECSE"), "utilities", "Structure Filter_20211015_v1.12.xls") class StructureFilter: - def __init__(self): - df = pd.read_excel(os.path.join(os.getenv("SECSE"), "utilities", FILTER_FILE), - usecols=["Pattern", "ID", "Max"]).dropna() + def __init__(self, filter_lst=FILTER_FILE): + df = pd.read_excel(filter_lst, usecols=["Pattern", "ID", "Max"]).dropna() + df["ID"] = df["ID"].astype(str) df = df.set_index("ID") df["Pattern_sma"] = df["Pattern"].apply(lambda x: Chem.MolFromSmarts(x)) self.fdic = df[["Pattern_sma", "Max"]].T.to_dict()