Skip to content

Commit

Permalink
v1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
ChongLu121 committed Feb 21, 2023
1 parent df51bd1 commit 2715fa9
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 69 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@ further validation.

----------------------------
1. Setup dependencies
```bash
conda create --name secse -c rdkit -c conda-forge parallel tqdm biopandas openbabel chemprop xlrd=2 pandarallel rdkit=2022.03
conda activate secse
```bash
conda create --name secse -c conda-forge parallel tqdm biopandas openbabel chemprop xlrd=2 pandarallel rdkit=2022.03
conda activate secse
```
2. ```bash
git clone https://github.com/KeenThera/SECSE.git
```
3. Set Environment Variables
`export SECSE=/path/to/SECSE`
Expand Down
18 changes: 7 additions & 11 deletions secse/evaluate/docking.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
import argparse
import os
import shutil
import subprocess
import sys

from rdkit import Chem
from rdkit.Chem import AllChem
from uitilities.function_helper import shell_cmd_execute

sys.path.append(os.getenv("SECSE"))

Expand All @@ -21,27 +21,23 @@


def dock_by_py_vina(workdir, smi, receptor, cpu_num, x, y, z, box_size_x=20, box_size_y=20, box_size_z=20):
cmd = " ".join(
list(map(str, [VINA_SHELL, workdir, smi, receptor, x, y, z, box_size_x, box_size_y, box_size_z, cpu_num])))
print(cmd)
subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
cmd = list(map(str, [VINA_SHELL, workdir, smi, receptor, x, y, z, box_size_x, box_size_y, box_size_z, cpu_num]))
shell_cmd_execute(cmd)
merged_sdf(workdir, 0)


def dock_by_py_autodock_gpu(workdir, smi, receptor, cpu_num, gpu_num):
cmd = " ".join(list(map(str, [AUTODOCK_GPU_SHELL, workdir, smi, receptor, cpu_num, gpu_num])))
print(cmd)
subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
cmd = list(map(str, [AUTODOCK_GPU_SHELL, workdir, smi, receptor, cpu_num, gpu_num]))
shell_cmd_execute(cmd)
merged_sdf(workdir, 1)


def merged_sdf(workdir, program):
# modify output sdf
check_mols(workdir, program)
out_sdf = os.path.join(workdir, "docking_outputs_with_score.sdf")
cmd_cat = "find {} -name \"*sdf\" | xargs -n 100 cat > {}".format(os.path.join(workdir, "sdf_files"), out_sdf)
print(cmd_cat)
subprocess.check_output(cmd_cat, shell=True, stderr=subprocess.STDOUT)
cmd_cat = ["find", os.path.join(workdir, "sdf_files"), "-name \"*sdf\" | xargs -n 100 cat >", out_sdf]
shell_cmd_execute(cmd_cat)
# remove temporary files
shutil.rmtree(os.path.join(workdir, "pdb_files"))
shutil.rmtree(os.path.join(workdir, "ligands_for_docking"))
Expand Down
5 changes: 2 additions & 3 deletions secse/evaluate/glide_docking.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@
@time: 2021/11/19/10:05
"""
import os
import subprocess
from uitilities.function_helper import shell_cmd_execute

GLIDE_SHELL = os.path.join(os.getenv("SECSE"), "evaluate", "ligprep_glide.sh")


def dock_by_glide(workdir, mols_smi, target, gen, dock_mode, cpu_num):
ligprep_glide = [GLIDE_SHELL, mols_smi, workdir, target, str(gen), dock_mode, str(cpu_num)]
print(" ".join(ligprep_glide))
subprocess.check_output(" ".join(ligprep_glide), shell=True, stderr=subprocess.STDOUT)
shell_cmd_execute(ligprep_glide)
glide_out = os.path.join(workdir, "glide_gen_{}_lib.sdf".format(gen))
sdf_path = os.path.join(workdir, "docking_outputs_with_score.sdf")
write_score = False
Expand Down
2 changes: 1 addition & 1 deletion secse/evaluate/ligprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def ionization(sdf_path):
mol.removeh()
mol.OBMol.AddHydrogens(False, True, 7.4)
mol.OBMol.CorrectForPH(7.4)
charge_model = ob.OBChargeModel_FindType("gasteiger")
charge_model = ob.OBChargeModel.FindType("gasteiger")
charge_model.ComputeCharges(mol.OBMol)
# mol.localopt(forcefield='mmff94', steps=500)
mol.write("pdbqt", "{}.pdbqt".format(os.path.join(path, name)), overwrite=True)
Expand Down
43 changes: 22 additions & 21 deletions secse/grow_processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import csv
import shutil
import os
import subprocess
import pandas as pd
import rdkit
import configparser
Expand All @@ -20,15 +19,17 @@
from scoring.sampling import sample_by_similarity, sample_by_rule_weight
from evaluate.docking import dock_by_py_vina, dock_by_py_autodock_gpu
from uitilities.load_rules import json_to_DB
from uitilities.function_helper import shell_cmd_execute
import time

rdkit.RDLogger.DisableLog("rdApp.*")


class Grow(object):
def __init__(self, generation, mols_smi, workdir, num_per_gen, docking_program,
receptor, start_gen, dl_mode, config_path,
cpu_num=0, gpu_num=1, rule_db=0, x=0, y=0, z=0, box_size_x=0, box_size_y=0, box_size_z=0):
def __init__(self, generation, mols_smi, workdir, num_per_gen, docking_program, receptor, start_gen, dl_mode,
config_path, cpu_num=0, gpu_num=1, rule_db=0, project_code="GEN", x=0, y=0, z=0, box_size_x=0,
box_size_y=0, box_size_z=0):

self.mols_smi = mols_smi
self.total_generation = int(generation)
self.workdir = workdir
Expand All @@ -50,14 +51,18 @@ def __init__(self, generation, mols_smi, workdir, num_per_gen, docking_program,

self.config_path = config_path

rule_db = str(rule_db)
if rule_db in [0, "0"]:
self.rule_db = None
elif str(rule_db).endswith("json"):
elif rule_db.endswith("json"):
os.makedirs(self.workdir, exist_ok=True)
self.rule_db = os.path.join(self.workdir, "rules.db")
json_to_DB(rule_db, self.rule_db)
elif rule_db.endswith("db"):
self.rule_db = rule_db
else:
raise Exception("Please check your input rule file.")
self.project_code = project_code

self.lig_sdf = None
self.winner_df = None
Expand Down Expand Up @@ -131,9 +136,8 @@ def dl_pre(self, step):
config.read(self.config_path)

dl_select_num = config.get("deep learning", "dl_per_gen")
dl_cmd = " ".join([dl_shell, self.workdir, train, pre, str(self.gen), dl_select_num, "22"])
print(dl_cmd)
subprocess.check_output(dl_cmd, shell=True, stderr=subprocess.STDOUT)
dl_cmd = [dl_shell, self.workdir, train, pre, str(self.gen), dl_select_num, "22"]
shell_cmd_execute(dl_cmd)
# docking top predicted compounds
self.workdir_now = os.path.join(self.workdir, "generation_{}_pre".format(self.gen))
self.mols_smi = os.path.join(self.workdir_now, "mols_for_docking_pred.smi")
Expand All @@ -144,7 +148,7 @@ def dl_pre(self, step):
self.lig_sdf = os.path.join(self.workdir, "generation_{}".format(self.gen),
"docking_outputs_with_score.sdf")
merge_cmd = ["cat", os.path.join(self.workdir_now, "docking_outputs_with_score.sdf"), ">>", self.lig_sdf]
subprocess.check_output(" ".join(merge_cmd), shell=True, stderr=subprocess.STDOUT)
shell_cmd_execute(merge_cmd)
self.workdir_now = os.path.join(self.workdir, "generation_{}".format(self.gen))

def grow(self):
Expand Down Expand Up @@ -181,26 +185,23 @@ def grow(self):

self._generation_dir = os.path.join(self.workdir_now, "generation_split_by_seed")
self.winner_df = self.winner_df.reset_index(drop=True)
header = mutation_df(self.winner_df, self.workdir, self.cpu_num, self.gen, self.rule_db)
header = mutation_df(self.winner_df, self.workdir, self.cpu_num, self.gen, self.rule_db, self.project_code)
generation_path = os.path.join(self.workdir_now, "generation")

cmd_cat = "cat {} > {}".format(os.path.join(self.workdir_now, "mutation.csv"),
generation_path + ".raw")
subprocess.check_output(cmd_cat, shell=True, stderr=subprocess.STDOUT)
cmd_dedup = "awk -F',' '!seen[$(NF-4)]++' " + generation_path + ".raw > " + generation_path + ".csv"
subprocess.check_output(cmd_dedup, shell=True, stderr=subprocess.STDOUT)
cmd_cat = ["cat", os.path.join(self.workdir_now, "mutation.csv"), ">", generation_path + ".raw"]
shell_cmd_execute(cmd_cat)
cmd_dedup = ["awk -F',' '!seen[$(NF-4)]++'", generation_path + ".raw", ">", generation_path + ".csv"]
shell_cmd_execute(cmd_dedup)
if not os.path.exists(self._generation_dir):
os.mkdir(self._generation_dir)
cmd_split = "awk -F, '{print>\"" + self._generation_dir + "/\"$2\".csv\"}' " + generation_path + ".csv"
subprocess.check_output(cmd_split, shell=True, stderr=subprocess.STDOUT)
cmd_split = ["awk -F, '{print>\"" + self._generation_dir + "/\"$2\".csv\"}'", generation_path + ".csv"]
shell_cmd_execute(cmd_split)
# filter
print("Step 2: Filtering all mutated mols")
time1 = time.time()
cmd_filter = ["sh", os.path.join(os.getenv("SECSE"), "growing", "filter_parallel.sh"), self.workdir_now,
cmd_filter = ["bash", os.path.join(os.getenv("SECSE"), "growing", "filter_parallel.sh"), self.workdir_now,
str(self.gen), self.config_path, str(self.cpu_num)]
cmd_filter = " ".join(cmd_filter)
print(cmd_filter)
subprocess.check_output(cmd_filter, shell=True, stderr=subprocess.STDOUT)
shell_cmd_execute(cmd_filter)
time2 = time.time()
print("Filter runtime: {:.2f} min.".format((time2 - time1) / 60))

Expand Down
11 changes: 6 additions & 5 deletions secse/growing/mutation/mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys

sys.path.append(os.getenv("SECSE"))
import subprocess
import copy
import sqlite3
import pandas as pd
Expand All @@ -12,6 +11,7 @@
from rdkit.Chem import rdChemReactions
from uitilities.wash_mol import get_bridged_atoms, neutralize_atoms
from uitilities.load_rules import json_to_DB
from uitilities.function_helper import shell_cmd_execute

rdkit.RDLogger.DisableLog("rdApp.*")

Expand Down Expand Up @@ -168,7 +168,7 @@ def clean(self):
self.out_product_smiles = []


def mutation_df(df: pd.DataFrame, workdir, cpu_num, gen=1, rule_db=None):
def mutation_df(df: pd.DataFrame, workdir, cpu_num, gen=1, rule_db=None, project_code="GEN"):
workdir = os.path.join(workdir, "generation_" + str(gen))

if rule_db is None:
Expand Down Expand Up @@ -208,11 +208,12 @@ def mutation_per_row(mut: Mutation, smi):
# write mutation mols
for info in i[-1]:
info = list(map(str, info))
new_line = last_gen_info + [info[0]] + ["GEN_" + str(gen) + "_M_" + str(n).zfill(9)] + info[1:]
new_line = last_gen_info + [info[0]] + [
project_code.upper() + "_" + str(gen) + "_M_" + str(n).zfill(9)] + info[1:]
f.write(",".join(new_line) + "\n")
n += 1
# drop duplicates product smiles by awk
cmd_dedup = "awk -F',' '!seen[$(NF-4)]++' " + mut_path + ".raw > " + mut_path + ".csv"
subprocess.check_output(cmd_dedup, shell=True, stderr=subprocess.STDOUT)
cmd_dedup = ["awk -F',' '!seen[$(NF-4)]++'", mut_path + ".raw ", ">", mut_path + ".csv"]
shell_cmd_execute(cmd_dedup)

return header
6 changes: 3 additions & 3 deletions secse/report/grow_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
import subprocess
from pandarallel import pandarallel
import configparser

from scoring.ranking import read_dock_file
from uitilities.function_helper import shell_cmd_execute

pandarallel.initialize(verbose=0)

Expand Down Expand Up @@ -113,8 +113,8 @@ def grep_sdf(workdir, merge_file):
merged_sdf = os.path.join(workdir, "merged_all.sdf")
selected_sdf = os.path.join(workdir, "selected.sdf")
# merge all sdf
cmd_merge = "find {} -name \"docking_outputs_with_score.sdf\" | xargs cat > {}".format(workdir, merged_sdf)
subprocess.check_output(cmd_merge, shell=True, stderr=subprocess.STDOUT)
cmd_merge = ["find", workdir, "-name \"docking_outputs_with_score.sdf\" | xargs cat >", merged_sdf]
shell_cmd_execute(cmd_merge)
# create ids
df = pd.read_csv(merge_file)
ids = list(set(df["id"].apply(lambda x: x.split("-dp")[0])))
Expand Down
11 changes: 6 additions & 5 deletions secse/run_secse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def main():
cpu_num = config.getint("DEFAULT", "cpu")
gpu_num = config.getint("DEFAULT", "gpu")
rule_db = config.get("DEFAULT", "rule_db")
project_code = config.get("DEFAULT", "project_code")

receptor = config.get("docking", "target")
dl_mode = config.getint("deep learning", "mode")
Expand All @@ -49,16 +50,16 @@ def main():
return None

if "vina" in docking_program.lower():
workflow = Grow(num_gen, mols_smi, workdir, num_per_gen, docking_program, receptor,
start_gen, dl_mode, args.config, cpu_num=cpu_num, rule_db=rule_db, x=x, y=y, z=z,
workflow = Grow(num_gen, mols_smi, workdir, num_per_gen, docking_program, receptor, start_gen, dl_mode,
args.config, cpu_num=cpu_num, rule_db=rule_db, project_code=project_code, x=x, y=y, z=z,
box_size_x=box_size_x, box_size_y=box_size_y, box_size_z=box_size_z)

elif "glide" in docking_program.lower():
workflow = Grow(num_gen, mols_smi, workdir, num_per_gen, docking_program, receptor, start_gen, dl_mode,
args.config, cpu_num=cpu_num, rule_db=rule_db)
args.config, cpu_num=cpu_num, rule_db=rule_db, project_code=project_code)
elif "autodock-gpu" in docking_program.lower():
workflow = Grow(num_gen, mols_smi, workdir, num_per_gen, docking_program, receptor, start_gen, dl_mode,
args.config, cpu_num=cpu_num, gpu_num=gpu_num, rule_db=rule_db)
args.config, cpu_num=cpu_num, gpu_num=gpu_num, rule_db=rule_db, project_code=project_code)
else:
print("Please check your input docking program argument.")
return None
Expand All @@ -75,7 +76,7 @@ def main():
" / ___| | ____| / ___| / ___| | ____|\n",
" \\___ \\ | _| | | \\___ \\ | _| \n",
" ___) | | |___ | |___ ___) | | |___ \n",
" |____/ |_____| \\____| |____/ |_____|")
" |____/ |_____| \\____| |____/ |_____| v1.2")

main()
time2 = time.time()
Expand Down
2 changes: 1 addition & 1 deletion secse/scoring/chemprop_pre.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ mkdir -p "$model_dir"
model="$model_dir"/G"$max_gen"_seed"$seed"
chemprop_train --data_path "$train" --dataset_type regression --save_dir \
"$model" --seed "$seed" --save_smiles --save_preds --show_individual_scores \
--extra_metrics {r2,mae,mse} --split_type random
--extra_metrics r2 mae mse --split_type random

# split files and prediction with CPU Parallelization
split_dir=$workdir/prediction/pre_split_$max_gen
Expand Down
32 changes: 16 additions & 16 deletions secse/scoring/docking_score_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
@time: 2021/10/27/14:26
"""
import argparse
import subprocess

from openbabel import openbabel
import pandas as pd
Expand All @@ -16,6 +15,7 @@
from rdkit.Chem import PandasTools
from rdkit.Chem import MolStandardize
from tqdm import tqdm
from uitilities.function_helper import shell_cmd_execute

rdkit.RDLogger.DisableLog("rdApp.*")

Expand Down Expand Up @@ -47,30 +47,30 @@ def get_pre(workdir, max_gen, get_all=False):
pre_raw = os.path.join(pre_dir, "all_G" + str(max_gen) + "_for_pre.raw")
pre_file = os.path.join(pre_dir, "all_G" + str(max_gen) + "_for_pre.csv")

cmd_cat = "find {} -name \"filter.csv\" |xargs awk -F, 'NR>1{{print $(NF-5)\",\"$(NF-6)}}' > {}".format(
workdir, pre_raw)
subprocess.check_output(cmd_cat, shell=True, stderr=subprocess.STDOUT)
cmd_dedup = "awk -F',' '!seen[$2]++' " + pre_raw + " > " + pre_file
subprocess.check_output(cmd_dedup, shell=True, stderr=subprocess.STDOUT)
cmd_cat = ["find", workdir, "-name \"filter.csv\" |xargs awk -F, 'NR>1{{print $(NF-5)\",\"$(NF-6)}}' >",
pre_raw]
shell_cmd_execute(cmd_cat)
cmd_dedup = ["awk -F',' '!seen[$2]++'", pre_raw, ">", pre_file]
shell_cmd_execute(cmd_dedup)

drop_mols = os.path.join(pre_dir, "drop_ids.txt")
mols_id_cat = "find {} -name \"mols_for_docking.smi\" |xargs cut -f2 > {}".format(workdir, drop_mols)
subprocess.check_output(mols_id_cat, shell=True, stderr=subprocess.STDOUT)
mols_id_cat = ["find", workdir, "-name \"mols_for_docking.smi\" |xargs cut -f2 >", drop_mols]
shell_cmd_execute(mols_id_cat)
final_file = os.path.join(pre_dir, "all_G" + str(max_gen) + "_for_pre_uniq.csv")
else:
pre_file = os.path.join(pre_dir, "gen_" + str(max_gen) + "_for_pre.csv")
cmd_cp = "awk -F, 'NR>1{{print $(NF-5)\",\"$(NF-6)}}' {} > {}".format(
os.path.join(workdir, "generation_" + str(max_gen), "filter.csv"), pre_file)
subprocess.check_output(cmd_cp, shell=True, stderr=subprocess.STDOUT)
cmd_cp = ["awk -F, 'NR>1{{print $(NF-5)\",\"$(NF-6)}}'",
os.path.join(workdir, "generation_" + str(max_gen), "filter.csv"), ">", pre_file]
shell_cmd_execute(cmd_cp)

drop_mols = os.path.join(pre_dir, "drop_ids_{}.txt".format(max_gen))
mols_id_cat = "cut -f2 {} > {}".format(
os.path.join(workdir, "generation_" + str(max_gen), "mols_for_docking.smi"), drop_mols)
subprocess.check_output(mols_id_cat, shell=True, stderr=subprocess.STDOUT)
mols_id_cat = ["cut -f2", os.path.join(workdir, "generation_" + str(max_gen), "mols_for_docking.smi"), ">",
drop_mols]
shell_cmd_execute(mols_id_cat)
final_file = os.path.join(pre_dir, "gen_" + str(max_gen) + "_for_pre_uniq.csv")

cmd_drop = "grep -wvf {} {} > {}".format(drop_mols, pre_file, final_file)
subprocess.check_output(cmd_drop, shell=True, stderr=subprocess.STDOUT)
cmd_drop = ["grep -wvf", drop_mols, pre_file, ">", final_file]
shell_cmd_execute(cmd_drop)
return final_file


Expand Down
17 changes: 17 additions & 0 deletions secse/uitilities/function_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env python
# -*- coding:utf-8 _*-
"""
@author: Lu Chong
@file: function_helper.py
@time: 2022/10/13/16:36
"""
import subprocess


def shell_cmd_execute(cmd_lst):
    """Run a shell command assembled from a list of fragments.

    The fragments are joined with single spaces and the resulting string is
    executed through the shell, so callers may pass shell syntax (pipes,
    ``>`` / ``>>`` redirections, quoted awk programs) as fragments. stderr is
    merged into stdout; on success the captured output is discarded and
    nothing is returned.

    NOTE: fragments are NOT quoted or escaped. A path containing whitespace
    or shell metacharacters will be split or interpreted by the shell, so
    only pass trusted values.

    :param cmd_lst: iterable of command fragments; items that are not already
        strings (e.g. int generation or CPU counts) are converted with ``str``.
    :raises Exception: if the command exits with a non-zero status. The
        captured output is printed first, and the original
        ``subprocess.CalledProcessError`` is attached as ``__cause__``.
    """
    cmd = " ".join(map(str, cmd_lst))
    try:
        subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # External tools are not guaranteed to emit valid UTF-8; a strict
        # decode here would raise UnicodeDecodeError and mask the real error.
        print(e.output.decode(errors="replace"))
        # Chain the original error so the return code/output stay reachable.
        raise Exception("Error executing command: {}".format(cmd)) from e

0 comments on commit 2715fa9

Please sign in to comment.