From 5a14311bac1a34fe686432f464bcdcc093e3c575 Mon Sep 17 00:00:00 2001 From: kljk345 Date: Fri, 13 Sep 2024 13:03:12 +0100 Subject: [PATCH] Init 3.1.3 --- docs/sphinx-source/algorithms.rst | 8 +- docs/sphinx-source/conf.py | 2 +- docs/sphinx-source/descriptors.rst | 44 ++++++++- docs/sphinx-source/transform.rst | 7 ++ optunaz/__init__.py | 2 +- optunaz/algorithms/chem_prop.py | 27 ++++-- optunaz/algorithms/chem_prop_hyperopt.py | 17 ++-- optunaz/builder.py | 40 +++++++-- optunaz/config/buildconfig.py | 12 +-- optunaz/descriptors.py | 1 - optunaz/evaluate.py | 92 ++++++------------- optunaz/model_writer.py | 2 + optunaz/schemagen.py | 2 +- optunaz/three_step_opt_build_merge.py | 6 +- optunaz/utils/tracking.py | 65 +++++++++----- pyproject.toml | 4 +- tests/test_tracking_build.py | 109 ++++++++++++++++++++--- 17 files changed, 296 insertions(+), 144 deletions(-) diff --git a/docs/sphinx-source/algorithms.rst b/docs/sphinx-source/algorithms.rst index 4c7d25e..d6e584f 100644 --- a/docs/sphinx-source/algorithms.rst +++ b/docs/sphinx-source/algorithms.rst @@ -14,13 +14,13 @@ Lasso KNeighborsClassifier -##### +#################### .. autoclass:: optunaz.config.optconfig.KNeighborsClassifier :members: KNeighborsRegressor -##### +################### .. autoclass:: optunaz.config.optconfig.KNeighborsRegressor :members: @@ -103,8 +103,8 @@ ChemPropHyperoptRegressor :members: -ChemPropRegressorPretrained -######################### +ChemPropHyperoptRegressorPretrained +################################### .. autoclass:: optunaz.config.optconfig.ChemPropRegressorPretrained :members: diff --git a/docs/sphinx-source/conf.py b/docs/sphinx-source/conf.py index a2427e3..db7223e 100644 --- a/docs/sphinx-source/conf.py +++ b/docs/sphinx-source/conf.py @@ -22,7 +22,7 @@ author = 'MAI' # The full version, including alpha/beta/rc tags -release = '3.1.2' +release = '3.1.3' # -- General configuration --------------------------------------------------- diff --git a/docs/sphinx-source/descriptors.rst b/docs/sphinx-source/descriptors.rst index d36f36b..5c2bc5d 100644 --- a/docs/sphinx-source/descriptors.rst +++ b/docs/sphinx-source/descriptors.rst @@ -19,7 +19,7 @@ ECFP_counts PathFP -########### +###### .. autoclass:: optunaz.descriptors.PathFP :members: @@ -37,13 +37,13 @@ UnscaledPhyschemDescriptors UnscaledJazzyDescriptors -########################### +######################## .. autoclass:: optunaz.descriptors.UnscaledJazzyDescriptors :members: UnscaledZScalesDescriptors -########### +########################## .. autoclass:: optunaz.descriptors.UnscaledZScalesDescriptors :members: @@ -67,7 +67,7 @@ PrecomputedDescriptorFromFile ZScales -########### +####### .. autoclass:: optunaz.descriptors.ZScalesDescriptors :members: @@ -94,3 +94,39 @@ CompositeDescriptor ################### .. autoclass:: optunaz.descriptors.CompositeDescriptor :members: + + +AmorProtDescriptors +################### +.. autoclass:: optunaz.descriptors.AmorProtDescriptors + :members: + + +PathFP +###### +.. autoclass:: optunaz.descriptors.PathFP + :members: + + +UnscaledMAPC +############ +.. autoclass:: optunaz.descriptors.UnscaledMAPC + :members: + + +UnscaledZScalesDescriptors +########################## +.. autoclass:: optunaz.descriptors.UnscaledZScalesDescriptors + :members: + + +MAPC +#### +.. autoclass:: optunaz.descriptors.MAPC + :members: + + +ZScalesDescriptors +################## +.. 
autoclass:: optunaz.descriptors.ZScalesDescriptors + :members: \ No newline at end of file diff --git a/docs/sphinx-source/transform.rst b/docs/sphinx-source/transform.rst index aaee310..6e5139d 100644 --- a/docs/sphinx-source/transform.rst +++ b/docs/sphinx-source/transform.rst @@ -24,3 +24,10 @@ ZScales ####### .. autoclass:: optunaz.utils.preprocessing.transform.ZScales :members: + + +AmorProt +######## +.. autoclass:: optunaz.utils.preprocessing.transform.AmorProt + :members: + diff --git a/optunaz/__init__.py b/optunaz/__init__.py index d6df3b4..b550279 100644 --- a/optunaz/__init__.py +++ b/optunaz/__init__.py @@ -1,5 +1,5 @@ import os -__version__ = "3.1.2" +__version__ = "3.1.3" os.environ["TQDM_DISABLE"] = "1" \ No newline at end of file diff --git a/optunaz/algorithms/chem_prop.py b/optunaz/algorithms/chem_prop.py index 0764f07..c85180a 100644 --- a/optunaz/algorithms/chem_prop.py +++ b/optunaz/algorithms/chem_prop.py @@ -97,6 +97,21 @@ def __exit__(self, *args): del self._stringio sys.stdout = self._stdout +class CaptureStdOutErr(list): + def __enter__(self): + self._stdout = sys.stdout + self._stderr = sys.stderr + sys.stdout = self._stringio = StringIO() + sys.stderr = self._stringioerr = StringIO() + return self + + def __exit__(self, *args): + self.extend(self._stringio.getvalue().splitlines()) + self.extend(self._stringioerr.getvalue().splitlines()) + del self._stringio + del self._stringioerr + sys.stdout = self._stdout + sys.stderr = self._stderr def save_model_memory(model_dir): tarblob = io.BytesIO() @@ -316,7 +331,7 @@ def fit(self, X, y): ).to_csv(x_aux_path.name, index=False) # arguments += ["--features_path", f"{x_aux_path.name}"] TODO: allow features once ChemProp is updated - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = QSARtunaTrainArgs().parse_args(arguments) chemprop.train.cross_validate( args=args, train_func=chemprop.train.run_training @@ -365,7 +380,7 @@ def predict_proba(self, X): else: X = np.array(X).reshape(len(X), 1) - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = QSARtunaPredictArgs().parse_args(arguments) model_objects = chemprop.train.load_model(args=args) preds = np.array( @@ -444,7 +459,7 @@ def predict_uncert(self, X): else: X = np.array(X[:, 0].reshape(len(X), 1)) - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = QSARtunaPredictArgs().parse_args(arguments) if uncertainty_method == "dropout": model_objects = list(chemprop.train.load_model(args=args)) @@ -505,7 +520,7 @@ def interpret(self, X, prop_delta=0.75): X = np.array(X[:, 0].reshape(len(X), 1)) X = pd.DataFrame(X, columns=["smiles"]) X.to_csv(data_path.name, index=False) - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = chemprop.args.InterpretArgs().parse_args(intrprt_args) with CaptureStdOut() as intrprt: interpret(args=args) @@ -565,7 +580,7 @@ def chemprop_fingerprint(self, X, fingerprint_type="MPN"): ] # if self.x_aux_ is not None: # fprnt_args += ["--features_path", f"{x_aux_path.name}"] TODO: allow features once ChemProp is updated - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = chemprop.args.FingerprintArgs().parse_args(fprnt_args) try: fps = chemprop.train.molecule_fingerprint.molecule_fingerprint( @@ -861,7 +876,7 @@ def fit(self, X, y): arguments += ["--frzn_ffn_layers", "1"] if self.frzn == "mpnn_last_ffn": arguments += ["--frzn_ffn_layers", f"{self.ffn_num_layers - 1}"] - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = QSARtunaTrainArgs().parse_args(arguments) 
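+            # parse_args consumes the CLI-style argument list assembled above,
+            # so chemprop trains here exactly as if driven from the shell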
chemprop.train.cross_validate( args=args, train_func=chemprop.train.run_training diff --git a/optunaz/algorithms/chem_prop_hyperopt.py b/optunaz/algorithms/chem_prop_hyperopt.py index 86ad542..476553e 100644 --- a/optunaz/algorithms/chem_prop_hyperopt.py +++ b/optunaz/algorithms/chem_prop_hyperopt.py @@ -37,16 +37,21 @@ chemprop.interpret.MoleculeDataLoader = MoleculeDataLoader -class CaptureStdOut(list): +class CaptureStdOutErr(list): def __enter__(self): self._stdout = sys.stdout + self._stderr = sys.stderr sys.stdout = self._stringio = StringIO() + sys.stderr = self._stringioerr = StringIO() return self def __exit__(self, *args): self.extend(self._stringio.getvalue().splitlines()) + self.extend(self._stringioerr.getvalue().splitlines()) del self._stringio + del self._stringioerr sys.stdout = self._stdout + sys.stderr = self._stderr def save_model_memory(model_dir): @@ -294,7 +299,7 @@ def fit(self, X, y): ).to_csv(x_aux_path.name, index=False) # arguments += ["--features_path", f"{x_aux_path.name}"] TODO: allow features once ChemProp is updated - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: if self.num_iters > 1: with tempfile.NamedTemporaryFile( delete=True, mode="w+" @@ -375,7 +380,7 @@ def predict_proba(self, X): else: X = np.array(X[:, 0].reshape(len(X), 1)) - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = chemprop.args.PredictArgs().parse_args(arguments) model_objects = chemprop.train.load_model(args=args) preds = np.array( @@ -450,7 +455,7 @@ def predict_uncert(self, X): else: X = np.array(X[:, 0].reshape(len(X), 1)) - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = chemprop.args.PredictArgs().parse_args(arguments) if uncertainty_method == "dropout": model_objects = list(chemprop.train.load_model(args=args)) @@ -512,7 +517,7 @@ def interpret(self, X, prop_delta=0.75): X = pd.DataFrame(X, columns=["smiles"]) X.to_csv(data_path.name, index=False) args = chemprop.args.InterpretArgs().parse_args(intrprt_args) - with CaptureStdOut() as intrprt: + with CaptureStdOutErr() as intrprt: interpret(args=args) intrprt = [ line.split(",") @@ -560,7 +565,7 @@ def chemprop_fingerprint(self, X, fingerprint_type="MPN"): # if self.x_aux_ is not None: # fprnt_args += ["--features_path", f"{x_aux_path.name}"] TODO: allow features once ChemProp is updated # load_model returns pred&train arguments, object models & tasks info - but we only need TrainArgs here - with CaptureStdOut() as _: + with CaptureStdOutErr() as _: args = chemprop.args.FingerprintArgs().parse_args(fprnt_args) _, trainargs, _, _, _, _ = chemprop.train.load_model(args=args) if fingerprint_type == "MPN": diff --git a/optunaz/builder.py b/optunaz/builder.py index 2405aed..a5db573 100644 --- a/optunaz/builder.py +++ b/optunaz/builder.py @@ -17,8 +17,27 @@ def build( estimator = buildconfig.algorithm.estimator() if merge_train_and_test_data: train_smiles, train_y, train_aux = buildconfig.data.get_merged_sets() + test_smiles, test_y, test_aux, test_X = None, None, None, None else: - train_smiles, train_y, train_aux, _, _, _ = buildconfig.data.get_sets() + ( + train_smiles, + train_y, + train_aux, + test_smiles, + test_y, + test_aux, + ) = buildconfig.data.get_sets() + if test_smiles is not None and len(test_smiles) > 0: + test_X, failed_idx = descriptor_from_config( + test_smiles, buildconfig.descriptor, cache=cache + ) + test_y, test_smiles, test_aux = remove_failed_idx( + failed_idx, test_y, test_smiles, test_aux + ) + if test_aux is not None: + test_X = np.hstack((test_X, test_aux)) + 
else: + test_X = None train_X, failed_idx = descriptor_from_config( train_smiles, buildconfig.descriptor, cache=cache @@ -35,13 +54,20 @@ def build( estimator.X_ = train_X estimator.y_ = train_y estimator.aux_ = train_aux + estimator.test_smiles_ = test_smiles + estimator.test_X_ = test_X + estimator.test_y_ = test_y + estimator.test_aux_ = test_aux - if merge_train_and_test_data: - train_scores = get_merged_train_score(estimator, buildconfig, cache=cache) - test_scores = None - else: + if ( + not merge_train_and_test_data + and test_smiles is not None + and len(test_smiles) > 0 + ): train_scores, test_scores = get_train_test_scores( - estimator, buildconfig, cache=cache + estimator, buildconfig, train_X, train_y, test_X, test_y ) - + else: + train_scores = get_merged_train_score(estimator, buildconfig, train_X, train_y) + test_scores = None return estimator, train_scores, test_scores diff --git a/optunaz/config/buildconfig.py b/optunaz/config/buildconfig.py index 25a1a19..4312da9 100644 --- a/optunaz/config/buildconfig.py +++ b/optunaz/config/buildconfig.py @@ -42,8 +42,8 @@ def estimator(self) -> BaseEstimator: class AdaBoostClassifier(Algorithm): @dataclass class AdaBoostClassifierParameters: - n_estimators: int = field(metadata=schema(min=1)) - learning_rate: float = field(metadata=schema(min=0.0001)) + n_estimators: int = field(default=1, metadata=schema(min=1)) + learning_rate: float = field(default=0.1, metadata=schema(min=0.0001)) name: Literal["AdaBoostClassifier"] parameters: AdaBoostClassifierParameters @@ -116,7 +116,7 @@ class LogisticRegression(Algorithm): @dataclass class LogisticRegressionParameters: solver: str - C: float = field(metadata=schema(min=0.001, max=1000)) + C: float = field(default=1.0, metadata=schema(min=0.001, max=1000)) name: Literal["LogisticRegression"] parameters: LogisticRegressionParameters @@ -137,7 +137,7 @@ def estimator(self): class PLSRegression(Algorithm): @dataclass class PLSParameters: - n_components: int = field(metadata=schema(min=1)) + n_components: int = field(default=2, metadata=schema(min=1)) name: Literal["PLSRegression"] parameters: PLSParameters @@ -152,9 +152,9 @@ def estimator(self): class RandomForestClassifier(Algorithm): @dataclass class RandomForestParameters: - max_depth: int = field(metadata=schema(min=1)) - n_estimators: int = field(metadata=schema(min=1)) max_features: str + max_depth: int = field(default=None, metadata=schema(min=1)) + n_estimators: int = field(default=100, metadata=schema(min=1)) name: Literal["RandomForestClassifier"] parameters: RandomForestParameters diff --git a/optunaz/descriptors.py b/optunaz/descriptors.py index 8743156..01a2c1a 100644 --- a/optunaz/descriptors.py +++ b/optunaz/descriptors.py @@ -33,7 +33,6 @@ from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator from jazzy.api import molecular_vector_from_smiles -from jazzy.exception import JazzyError from sklearn import preprocessing from joblib import Parallel, delayed, effective_n_jobs from optunaz.config import NameParameterDataclass diff --git a/optunaz/evaluate.py b/optunaz/evaluate.py index e2c661b..0a5264b 100644 --- a/optunaz/evaluate.py +++ b/optunaz/evaluate.py @@ -1,18 +1,10 @@ +from typing import List import numpy as np -from typing import Dict, List - -from sklearn.metrics import get_scorer - +from sklearn.metrics import check_scoring +from sklearn.calibration import calibration_curve from optunaz import objective from optunaz.config import ModelMode from optunaz.config.buildconfig import BuildConfig -from 
optunaz.utils import remove_failed_idx -from optunaz.descriptors import descriptor_from_config - - -def score_all(scores: List[str], estimator, X, y) -> Dict[str, float]: - result = {s: get_scorer(s)(estimator, X, y) for s in scores} - return result def get_scores(mode: ModelMode) -> List[str]: @@ -22,68 +14,36 @@ def get_scores(mode: ModelMode) -> List[str]: scores = objective.classification_scores else: raise ValueError(f"Unrecognized mode: {mode}") - return scores -def score_all_smiles(scores, estimator, smiles, descriptor, aux, y, cache=None): - X, failed_idx = descriptor_from_config(smiles, descriptor, cache=cache) - y, smiles, aux = remove_failed_idx(failed_idx, y, smiles, aux) - if aux is not None: - X = np.hstack((X, aux)) - return score_all(scores, estimator, X, y) - - -def get_train_test_scores(estimator, buildconfig: BuildConfig, cache=None): +def get_train_test_scores( + estimator, buildconfig: BuildConfig, train_X, train_y, test_X, test_y +): scores = get_scores(buildconfig.settings.mode) - - ( - train_smiles, - train_y, - train_aux, - test_smiles, - test_y, - test_aux, - ) = buildconfig.data.get_sets() - - train_scores = score_all_smiles( - scores, - estimator, - train_smiles, - buildconfig.descriptor, - train_aux, - train_y, - cache=cache, - ) - if test_smiles is not None and len(test_smiles) > 0: - test_scores = score_all_smiles( - scores, - estimator, - test_smiles, - buildconfig.descriptor, - test_aux, - test_y, - cache=cache, - ) - else: - test_scores = None - + train_scores = check_scoring(estimator, scoring=scores)(estimator, train_X, train_y) + test_scores = check_scoring(estimator, scoring=scores)(estimator, test_X, test_y) return train_scores, test_scores -def get_merged_train_score(estimator, buildconfig: BuildConfig, cache=None): +def get_merged_train_score(estimator, buildconfig: BuildConfig, train_X, train_y): scores = get_scores(buildconfig.settings.mode) + train_scores = check_scoring(estimator, scoring=scores)(estimator, train_X, train_y) + return train_scores - train_smiles, train_y, train_aux = buildconfig.data.get_merged_sets() - - train_scores = score_all_smiles( - scores, - estimator, - train_smiles, - buildconfig.descriptor, - train_aux, - train_y, - cache=cache, - ) - return train_scores +def calibration_analysis(y_test, y_pred): + try: + frac_true, frac_pred = calibration_curve(y_test, y_pred, n_bins=15) + bin_edges = frac_pred + except ValueError: + # weight each bin by the total number of values so that the sum of all bars equal unity + weights = np.ones_like(y_test) / len(y_test) + # calculate fraction of true points across uniform bins + frac_true, bin_edges = np.histogram(y_test, bins=15, weights=weights) + # calculate fraction of pred points across uniform true bins + frac_pred, _ = np.histogram(y_pred, bins=bin_edges, weights=weights) + # convert to cumulative sum for plotting + frac_true = np.cumsum(frac_true) + frac_pred = np.cumsum(frac_pred) + return list(zip(bin_edges, frac_true, frac_pred)) diff --git a/optunaz/model_writer.py b/optunaz/model_writer.py index f00db30..d3e816a 100644 --- a/optunaz/model_writer.py +++ b/optunaz/model_writer.py @@ -266,3 +266,5 @@ def save_model( with open(filename, "wb") as f: pickle.dump(model, f) + + return model diff --git a/optunaz/schemagen.py b/optunaz/schemagen.py index e071a52..16efebf 100644 --- a/optunaz/schemagen.py +++ b/optunaz/schemagen.py @@ -1,7 +1,7 @@ import inspect import json from textwrap import dedent -from typing import Any, Tuple, Optional +from typing import Any, Optional import 
apischema
from apischema import schema
diff --git a/optunaz/three_step_opt_build_merge.py b/optunaz/three_step_opt_build_merge.py
index e08c0c1..4ac2947 100644
--- a/optunaz/three_step_opt_build_merge.py
+++ b/optunaz/three_step_opt_build_merge.py
@@ -271,7 +271,7 @@ def log_scores(scores, main_score, label: str):
     if main_score_val is not None:
         logger.info(f"{label.capitalize()} score {main_score}: {main_score_val}")
     logger.info(
-        f"All {label} cores: { {k: round(number=v, ndigits=3) for k, v in scores.items()} }"
+        f"All {label} scores: { {k: round(number=v, ndigits=3) for k, v in scores.items()} }"
    )
 
 
@@ -283,7 +283,7 @@ def build_best(
     """Step 2. Build. Train a model with the best hyperparameters."""
     model, train_scores, test_scores = build(buildconfig, cache=cache)
 
-    save_model(
+    qsartuna_model = save_model(
         model,
         buildconfig,
         outfname,
@@ -298,7 +298,7 @@ def build_best(
         log_scores(test_scores, buildconfig.settings.scoring, "test")
 
     if buildconfig.settings.tracking_rest_endpoint is not None:
-        track_build(model, buildconfig)
+        track_build(qsartuna_model, buildconfig, test_scores)
 
     return buildconfig
diff --git a/optunaz/utils/tracking.py b/optunaz/utils/tracking.py
index 6da494f..d006598 100644
--- a/optunaz/utils/tracking.py
+++ b/optunaz/utils/tracking.py
@@ -2,18 +2,15 @@ import logging
 import os
 from dataclasses import dataclass
 from typing import List, Dict
-
 import requests
 from apischema import serialize
 from optunaz.config.build_from_opt import remove_algo_hash
 from optuna import Study
 from optuna.trial import FrozenTrial
-
 from optunaz.config.build_from_opt import buildconfig_from_trial
 from optunaz.config.buildconfig import BuildConfig
 from optunaz.config.optconfig import OptimizationConfig
-from optunaz.evaluate import get_train_test_scores
-from optunaz.model_writer import wrap_model
+from optunaz.evaluate import calibration_analysis
 
 logger = logging.getLogger(__name__)
 
@@ -32,6 +29,7 @@ class TrackingData:
     trial_state: str
     all_cv_test_scores: Dict[str, List[float]]
     buildconfig: BuildConfig
+    algorithm_hash: str
 
     def __post_init__(self):
         self.buildconfig.metadata = None  # Metadata is not essential - drop.
@@ -51,7 +49,7 @@ def round_scores(test_scores):
 
 @dataclass
 class InternalTrackingCallback:
-    """Callback to track (log) progress using internal tracking format"""
+    """Callback to track (log) Optimization progress using internal tracking format"""
 
     optconfig: OptimizationConfig
     trial_number_offset: int
@@ -74,6 +72,7 @@ def __call__(self, study: Study, trial: FrozenTrial) -> None:
                 trial_state=trial.state.name,
                 all_cv_test_scores=round_scores(trial.user_attrs["test_scores"]),
                 buildconfig=buildconfig,
+                algorithm_hash=trial.user_attrs["alg_hash"],
             )
 
             json_data = serialize(data)
@@ -99,35 +98,40 @@ class Datapoint:
     predicted: float
 
 
+@dataclass
+class Calpoint:
+    bin_edges: float
+    frac_true: float
+    frac_pred: float
+
+
 @dataclass
 class BuildTrackingData:
-    """Dataclass defining internal tracking format"""
+    """Dataclass defining internal Build tracking format"""
 
     response_column_name: str
-    test_scores: Dict[str, float]
+    test_scores: Dict[str, float] | str
     test_points: List[Datapoint]
+    cal_points: List[Calpoint] | str
 
 
-def track_build(model, buildconfig: BuildConfig):
-    train_scores, test_scores = get_train_test_scores(model, buildconfig)
+def track_build(qsartuna_model, buildconfig: BuildConfig, test_scores):
+    test_smiles = qsartuna_model.predictor.test_smiles_
+    test_aux = qsartuna_model.predictor.test_aux_
+    expected = qsartuna_model.predictor.test_y_
+    if test_smiles is None or len(test_smiles) < 1:
+        logger.warning("No test set.")
+        return
     rounded_test_scores = (
         {k: round(v, ndigits=3) for k, v in test_scores.items()}
         if test_scores is not None
-        else None
+        else ""
     )
-    _, _, _, smiles, expected, _ = buildconfig.data.get_sets()
-
-    if smiles is None or len(smiles) < 1:
-        logger.warning("No test set.")
-        return
-
-    mode = buildconfig.settings.mode
-    descriptor = buildconfig.descriptor
-    qsartuna_model = wrap_model(model, descriptor=descriptor, mode=mode)
-
-    predicted = qsartuna_model.predict_from_smiles(smiles)
+    predicted = qsartuna_model.predict_from_smiles(test_smiles, aux=test_aux)
+    if qsartuna_model.transform is not None:
+        expected = qsartuna_model.transform.reverse_transform(expected)
 
     test_points = [
         Datapoint(
             smiles=smi,
             expected=round(expval.item(), ndigits=3),  # item() converts numpy to float.
predicted=round(predval.item(), ndigits=3), ) - for smi, expval, predval in zip(smiles, expected, predicted) + for smi, expval, predval in zip(test_smiles, expected, predicted) ] + try: + cal_points = [ + Calpoint( + bin_edges=round(bin_edges.item(), ndigits=3), + frac_true=round(frac_true.item(), ndigits=3), + frac_pred=round(frac_pred.item(), ndigits=3), + ) + for bin_edges, frac_true, frac_pred in calibration_analysis( + expected, predicted + ) + ] + except ValueError: + cal_points = "" + data = BuildTrackingData( response_column_name=buildconfig.data.response_column, test_scores=rounded_test_scores, test_points=test_points, + cal_points=cal_points, ) json_data = serialize(data) @@ -156,4 +175,4 @@ def track_build(model, buildconfig: BuildConfig): try: response = requests.post(url, json=json_data, headers=headers) except Exception as e: - logger.warning(f"Failed to report build results to {url}: {e}") + logger.warning(f"Failed to report build results {json_data} to {url}: {e}") diff --git a/pyproject.toml b/pyproject.toml index 8566666..88a4eb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "qsartuna" -version = "3.1.2" +version = "3.1.3" description = "QSARtuna: QSAR using Optimization for Hyperparameter Tuning" authors = ["Molecular AI, AstraZeneca"] license = "Apache-2.0" @@ -18,7 +18,7 @@ pandas = "^1.5.3" jsonpickle = "^2.0" xgboost = "^1.3" rdkit = ">=2023.3.1" -scikit-learn = "1.4.0" +scikit-learn = "1.5.1" apischema = "^0.17" chemprop = "1.6.1" descriptastorus = "^2.4" # Chemprop dependency, but chemprop does not install it itself. diff --git a/tests/test_tracking_build.py b/tests/test_tracking_build.py index b4bb05c..b3a7d6e 100644 --- a/tests/test_tracking_build.py +++ b/tests/test_tracking_build.py @@ -5,10 +5,14 @@ import optunaz.three_step_opt_build_merge from optunaz.config import ModelMode -from optunaz.config.buildconfig import BuildConfig, Lasso +from optunaz.config.buildconfig import BuildConfig, Lasso, RandomForestClassifier from optunaz.datareader import Dataset from optunaz.descriptors import ECFP from optunaz.utils.preprocessing.splitter import Random +from optunaz.utils.preprocessing.transform import ( + LogBase, + LogNegative, +) @pytest.fixture @@ -38,12 +42,16 @@ def buildconfig_regression(file_drd2_50, shared_datadir): ) -def test_build_tracking(buildconfig_regression): - with tempfile.NamedTemporaryFile() as f: - optunaz.three_step_opt_build_merge.build_best(buildconfig_regression, f.name) +def test_build_tracking(shared_datadir, buildconfig_regression): + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best( + buildconfig_regression, build_best_pkl.name + ) -def test_build_2(shared_datadir): +def test_build_notest(shared_datadir): buildconfig = BuildConfig( data=Dataset( input_column="canonical", @@ -62,8 +70,35 @@ def test_build_2(shared_datadir): ), ) - with tempfile.NamedTemporaryFile() as f: - optunaz.three_step_opt_build_merge.build_best(buildconfig, f.name) + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name) + + +def test_build_cls(shared_datadir): + buildconfig = BuildConfig( + data=Dataset( + input_column="canonical", + response_column="molwt_gt_330", + training_dataset_file=str( + shared_datadir / "DRD2" / "subset-50" / "train.csv" + ), + 
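+            # hold out a random fraction of the training rows as the test set,
+            # so track_build has test points to report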
split_strategy=Random(fraction=0.2), + ), + metadata=None, + descriptor=ECFP.new(), + algorithm=RandomForestClassifier.new(max_features="auto"), + settings=BuildConfig.Settings( + mode=ModelMode.CLASSIFICATION, + tracking_rest_endpoint="http://localhost:8891", # To listen: nc -l -k 8891 + ), + ) + + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name) def test_3(shared_datadir): @@ -125,8 +160,16 @@ def test_3(shared_datadir): buildconfig.data.training_dataset_file = str( shared_datadir / "DRD2" / "subset-50" / "train.csv" ) - with tempfile.NamedTemporaryFile() as f: - optunaz.three_step_opt_build_merge.build_best(buildconfig, f.name) + buildconfig.data.intermediate_test_dataset_file = str( + shared_datadir / "int_test.csv" + ) + buildconfig.data.intermediate_training_dataset_file = str( + shared_datadir / "int_train.csv" + ) + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name) def test_4(shared_datadir): @@ -188,8 +231,16 @@ def test_4(shared_datadir): buildconfig.data.training_dataset_file = str( shared_datadir / "DRD2" / "subset-50" / "train.csv" ) - with tempfile.NamedTemporaryFile() as f: - optunaz.three_step_opt_build_merge.build_best(buildconfig, f.name) + buildconfig.data.intermediate_test_dataset_file = str( + shared_datadir / "int_test.csv" + ) + buildconfig.data.intermediate_training_dataset_file = str( + shared_datadir / "int_train.csv" + ) + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name) def test_build_notestset(shared_datadir): @@ -209,6 +260,38 @@ def test_build_notestset(shared_datadir): tracking_rest_endpoint="http://localhost:8891", # To listen: nc -l -k 8891 ), ) + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name) + - with tempfile.NamedTemporaryFile() as f: - optunaz.three_step_opt_build_merge.build_best(buildconfig, f.name) +def test_build_aux(shared_datadir): + buildconfig = BuildConfig( + data=Dataset( + input_column="canonical", + response_column="molwt", + training_dataset_file=str( + shared_datadir / "aux_descriptors_datasets" / "train_with_conc.csv" + ), + test_dataset_file=str( + shared_datadir / "aux_descriptors_datasets" / "train_with_conc.csv" + ), + aux_column="aux2", + log_transform=True, + log_transform_base=LogBase.LOG10, + log_transform_negative=LogNegative.FALSE, + log_transform_unit_conversion=2, + ), + metadata=None, + descriptor=ECFP.new(), + algorithm=Lasso.new(), + settings=BuildConfig.Settings( + mode=ModelMode.REGRESSION, + tracking_rest_endpoint="http://localhost:8891", # To listen: nc -l -k 8891 + ), + ) + with tempfile.NamedTemporaryFile( + mode="wt", delete=False, dir=shared_datadir, suffix=".pkl" + ) as build_best_pkl: + optunaz.three_step_opt_build_merge.build_best(buildconfig, build_best_pkl.name)
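
---

Note on the new calibration_analysis() helper in optunaz/evaluate.py: it drives
the Calpoint records posted by track_build(). Below is a minimal usage sketch,
assuming the patched optunaz package is importable; the arrays are synthetic
and purely illustrative.

    import numpy as np
    from optunaz.evaluate import calibration_analysis

    rng = np.random.default_rng(0)

    # Binary case: goes through sklearn's calibration_curve, so each tuple is
    # (mean predicted prob, fraction of positives, mean predicted prob).
    y_true_cls = rng.integers(0, 2, size=200)
    y_prob = rng.random(200)
    cls_points = calibration_analysis(y_true_cls, y_prob)

    # Regression case: calibration_curve raises ValueError on non-binary
    # targets, so the fallback compares cumulative histograms of true vs.
    # predicted values over shared uniform bins.
    y_true_reg = rng.normal(size=200)
    y_pred_reg = y_true_reg + rng.normal(scale=0.3, size=200)
    reg_points = calibration_analysis(y_true_reg, y_pred_reg)

    # Each element unpacks like the Calpoint dataclass in tracking.py:
    for bin_edge, frac_true, frac_pred in reg_points[:3]:
        print(f"{bin_edge:.3f} {frac_true:.3f} {frac_pred:.3f}")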