# run_optimization.py
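"""Optimize and build a QSAR regression model for one MMP study combination using Qptuna.

The property, data set id, algorithm, and data directory are selected on the command line;
hyperparameters are tuned with Optuna and the best model is rebuilt and saved to disk.
"""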
import argparse
import logging
import os
import sys

from qptuna.three_step_opt_build_merge import (
    optimize,
    buildconfig_best,
    build_best,
)
from qptuna.config import ModelMode, OptimizationDirection
from qptuna.config.optconfig import (
    OptimizationConfig,
    PLS,
    RandomForest,
    SVR,
    XGBregressor,
)
from qptuna.datareader import Dataset
from qptuna.descriptors import ECFP

logger = logging.getLogger(__name__)


def main():
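    """Parse the command-line arguments, run the hyperparameter optimization, and build the best model."""

    # Per-algorithm hyperparameter search spaces for the Optuna optimization.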
    algs = {
        "PLS": PLS.new(
            n_components={"low": 2, "high": 10}
        ),
        "RF": RandomForest.new(
            max_features=["auto", "sqrt", "log2"],
            max_depth={"low": 2, "high": 32},
            n_estimators={"low": 10, "high": 250}
        ),
        "SVR": SVR.new(),
        "xgboost": XGBregressor.new(
            max_depth={"low": 2, "high": 32},
            n_estimators={"low": 10, "high": 300},
            learning_rate={"low": 0.1, "high": 0.1}  # Constant.
        )
    }
    props = ["Clearance", "logD", "Permeability", "Solubility"]
    sets = ["1", "2", "3", "4"]

    parser = argparse.ArgumentParser(description='Run MMP study.')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument("--prop", choices=props, required=True)
    requiredNamed.add_argument("--setid", choices=sets, required=True)
    requiredNamed.add_argument("--alg", choices=algs.keys(), required=True)
    requiredNamed.add_argument("--datadir", required=True)
    args = parser.parse_args()

    prop = args.prop
    setid = args.setid
    alg = args.alg
    datadir = args.datadir

    study_name = f"MMP_{datadir}_{prop}_set{setid}_{alg}"
    config = OptimizationConfig(
        data=Dataset(
            input_column="SMILES",
            response_column="VALUE",
            training_dataset_file=f"{datadir}/{prop}_set{setid}_train.csv",
        ),
        descriptors=[
            ECFP.new(radius=3, nBits=2048)  # For ECFP6, radius=3.
        ],
        algorithms=[
            algs[alg],
        ],
        settings=OptimizationConfig.Settings(
            mode=ModelMode.REGRESSION,
            n_jobs=8,
            cross_validation=3,
            n_trials=300,
            direction=OptimizationDirection.MAXIMIZATION,
            optuna_storage=f"sqlite:///optuna-storage/optuna_storage_{study_name}.sqlite",
            track_to_mlflow=False,
        ),
    )
    # Run the Optuna hyperparameter optimization.
    study = optimize(config, study_name=study_name)

    # Get the best trial from the study and make a build (training) configuration for it.
    buildconfig = buildconfig_best(study)

    # Build (re-train) and save the best model.
    build_best(buildconfig, f"best-models/best-{study_name}.pkl")


if __name__ == '__main__':
    # Configure the logger to write to the console (Slurm collects console output by default).
    # For remote logging, add handlers such as HTTPHandler or SysLogHandler from logging.handlers.
    # For machine-readable logs, use a JSON formatter, e.g. https://github.com/madzak/python-json-logger.
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
    logging.captureWarnings(True)  # Capture sklearn warnings about version mismatches.
    sys.exit(main())
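

# Example invocation (illustrative; the dataset directory name below is an assumption,
# not part of this script):
#
#   python run_optimization.py --prop logD --setid 1 --alg RF --datadir data
#
# The paths used above imply that {datadir}/{prop}_set{setid}_train.csv exists and that
# the optuna-storage/ and best-models/ output directories have been created beforehand
# (e.g. by the surrounding Slurm workflow).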