From e4d88719d7a88f24ec2576fc8b9db587514c563a Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Wed, 31 Jul 2024 12:03:33 -0400 Subject: [PATCH 01/57] sampling and seed --- atomsci/ddm/docs/PARAMETERS.md | 32 ++ atomsci/ddm/pipeline/model_datasets.py | 9 +- atomsci/ddm/pipeline/model_pipeline.py | 70 +++- atomsci/ddm/pipeline/model_wrapper.py | 86 +++-- atomsci/ddm/pipeline/parameter_parser.py | 18 +- atomsci/ddm/pipeline/perf_data.py | 10 +- atomsci/ddm/pipeline/random_seed.py | 62 ++++ atomsci/ddm/pipeline/sampling.py | 55 +++ atomsci/ddm/pipeline/splitting.py | 53 +-- .../sampling_json/kfold_cv_NN_SMOTE.json | 16 + .../kfold_cv_NN_undersampling.json | 16 + .../sampling_json/kfold_cv_RF_SMOTE.json | 16 + .../kfold_cv_RF_undersampling.json | 16 + .../sampling_json/kfold_cv_xgboost_SMOTE.json | 16 + .../kfold_cv_xgboost_undersampling.json | 16 + .../train_valid_test_NN_SMOTE.json | 16 + .../train_valid_test_NN_undersampling.json | 16 + .../train_valid_test_RF_SMOTE.json | 16 + .../train_valid_test_RF_undersampling.json | 16 + .../train_valid_test_xgboost_SMOTE.json | 16 + ...rain_valid_test_xgboost_undersampling.json | 16 + .../sampling_test/sampling_test.py | 313 ++++++++++++++++++ .../nn_classification_kfold_test.json | 15 + .../nn_classification_train_valid_test.json | 15 + .../model_json/nn_regression_kfold_test.json | 15 + .../nn_regression_train_valid_test.json | 15 + .../rf_classification_kfold_test.json | 15 + .../rf_classification_train_valid_test.json | 15 + .../model_json/rf_regression_kfold_test.json | 15 + .../rf_regression_train_valid_test.json | 15 + .../xgboost_classification_kfold_test.json | 15 + ...boost_classification_train_valid_test.json | 15 + .../xgboost_regression_kfold_test.json | 15 + .../xgboost_regression_train_valid_test.json | 15 + .../integrative/seed_test/seed_test_models.py | 292 ++++++++++++++++ .../seed_test/seed_test_splitting.py | 147 ++++++++ .../split_json/test_kfold_random_split.json | 13 + 
.../split_json/test_kfold_scaffold_split.json | 13 + .../test_random_train_valid_test_split.json | 13 + .../test_scaffold_train_valid_test.json | 13 + 40 files changed, 1501 insertions(+), 70 deletions(-) create mode 100644 atomsci/ddm/pipeline/random_seed.py create mode 100644 atomsci/ddm/pipeline/sampling.py create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_SMOTE.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_undersampling.json create mode 100644 atomsci/ddm/test/integrative/sampling_test/sampling_test.py create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_train_valid_test.json create mode 100644 
atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_kfold_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/seed_test_models.py create mode 100644 atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_random_split.json create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_scaffold_split.json create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_random_train_valid_test_split.json create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json diff --git a/atomsci/ddm/docs/PARAMETERS.md b/atomsci/ddm/docs/PARAMETERS.md index 135b332b..611ae5c6 100644 --- a/atomsci/ddm/docs/PARAMETERS.md +++ b/atomsci/ddm/docs/PARAMETERS.md @@ -276,6 +276,14 @@ The AMPL pipeline contains many parameters and options to fit models and make pr |*Description:*|True/False flag for setting verbosity| |*Default:*|FALSE| |*Type:*|Bool| + +- **seed** + 
+||| +|-|-| +|*Description:*|Seed used for initializing a random number generator to ensure results are reproducible. Default is None and a random seed will be generated.| +|*Default:*|None| +|*Type:*|int| - **production** @@ -529,6 +537,30 @@ the model will train for max_epochs regardless of validation error.| |*Default:*|scaffold| |*Type:*|str| +- **sampling_method** + +||| +|-|-| +|*Description:*|The sampling method for addressing class imbalance in classification datasets. Options include 'undersampling' and 'SMOTE'.| +|*Default:*|None| +|*Type:*|str| + +- **sampling_ratio** + +||| +|-|-| +|*Description:*|The desired ratio of the minority class to the majority class after sampling (e.g., if str, 'minority', 'not minority'; if float, '0.2', '1.0'). | +|*Default:*|auto| +|*Type:*|str| + +- **sampling_k_neighbors** + +||| +|-|-| +|*Description:*|The number of nearest neighbors to consider when generating synthetic samples (e.g., 5, 7, 9). Specifically used for SMOTE sampling method.| +|*Default:*|5| +|*Type:*|int| + - **mtss\_num\_super\_scaffolds** ||| diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index b070e8c0..8ca3ce7a 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -16,7 +16,6 @@ import traceback import sys -from atomsci.ddm.pipeline import random_seed as rs from collections import defaultdict @@ -438,7 +437,7 @@ def get_dataset_tasks(self, dset_df): return self.tasks is not None # **************************************************************************************** - def split_dataset(self): + def split_dataset(self, random_state=None, seed=None): """Splits the dataset into paired training/validation and test subsets, according to the split strategy selected by the model params. For traditional train/valid/test splits, there is only one training/validation pair. 
For k-fold cross-validation splits, there are k different train/valid pairs; the validation sets are @@ -457,7 +456,7 @@ def split_dataset(self): # Create object to delegate splitting to. if self.splitting is None: - self.splitting = split.create_splitting(self.params) + self.splitting = split.create_splitting(self.params, random_state=random_state, seed=seed) self.train_valid_dsets, self.test_dset, self.train_valid_attr, self.test_attr = \ self.splitting.split_dataset(self.dataset, self.attr, self.params.smiles_col) if self.train_valid_dsets is None: @@ -568,7 +567,7 @@ def create_dataset_split_table(self): return split_df # **************************************************************************************** - def load_presplit_dataset(self, directory=None): + def load_presplit_dataset(self, directory=None, random_state=None, seed=None): """Loads a table of compound IDs assigned to split subsets, and uses them to split the currently loaded featurized dataset. @@ -595,7 +594,7 @@ def load_presplit_dataset(self, directory=None): """ # Load the split table from the datastore or filesystem - self.splitting = split.create_splitting(self.params) + self.splitting = split.create_splitting(self.params, random_state=random_state, seed=seed) try: split_df, split_kv = self.load_dataset_split_table(directory) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 575c6b6d..5df0f660 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -32,6 +32,8 @@ from atomsci.ddm.pipeline import parameter_parser as parse from atomsci.ddm.pipeline import model_tracker as trkr from atomsci.ddm.pipeline import transformations as trans +from atomsci.ddm.pipeline import random_seed as rs +from atomsci.ddm.pipeline import sampling as sample logging.basicConfig(format='%(asctime)-15s %(message)s') @@ -154,7 +156,7 @@ class ModelPipeline: data (ModelDataset object): A data object that featurizes and splits 
the dataset """ - def __init__(self, params, ds_client=None, mlmt_client=None): + def __init__(self, params, ds_client=None, mlmt_client=None, random_state=None, seed=None): """Initializes ModelPipeline object. Args: @@ -188,6 +190,23 @@ def __init__(self, params, ds_client=None, mlmt_client=None): self.log = logging.getLogger('ATOM') self.run_mode = 'training' # default, can be overridden later self.start_time = time.time() + + # initialize seed + if seed is None: + seed = getattr(params, 'seed', None) + self.random_gen = rs.RandomStateGenerator(params, seed) + self.seed = self.random_gen.get_seed() + else: + # pass the seed into the RandomStateGenerator + self.random_gen = rs.RandomStateGenerator(seed) + self.seed = self.random_gen.get_seed() + + if random_state is None: + self.random_state = self.random_gen.get_random_state() + else: + self.random_state = random_state + # log the seed used + self.log.info('Initiating ModelPipeline with seed {}'.format(self.seed)) # Default dataset_name parameter from dataset_key if params.dataset_name is None: @@ -237,7 +256,7 @@ def __init__(self, params, ds_client=None, mlmt_client=None): # **************************************************************************************** - def load_featurize_data(self, params=None): + def load_featurize_data(self, params=None, random_state=None, seed=None): """Loads the dataset from the datastore or the file system and featurizes it. If we are training a new model, split the dataset into training, validation and test sets. @@ -248,6 +267,7 @@ def load_featurize_data(self, params=None): Args: params (Namespace): Optional set of parameters to be used for featurization; by default this function uses the parameters used when the pipeline was created. + seed (int): Optional seed for reproducibility Side effects: Sets the following attributes of the ModelPipeline @@ -266,10 +286,13 @@ def load_featurize_data(self, params=None): self.log.info('Training in production mode. 
Ignoring ' 'previous split and creating production split. ' 'Production split will not be saved.') - self.data.split_dataset() - elif not (params.previously_split and self.data.load_presplit_dataset()): - self.data.split_dataset() + self.data.split_dataset(random_state=self.random_state, seed=self.seed) + elif not (params.previously_split and self.data.load_presplit_dataset(random_state=self.random_state, seed=self.seed)): + self.data.split_dataset(random_state=self.random_state, seed=self.seed) self.data.save_split_dataset() + # write split metadata + self.create_split_metadata() + self.save_split_metadata() if self.data.params.prediction_type == 'classification': self.data._validate_classification_dataset() # We now create transformers after splitting, to allow for the case where the transformer @@ -282,6 +305,8 @@ def load_featurize_data(self, params=None): if self.run_mode == 'training': for i, (train, valid) in enumerate(self.data.train_valid_dsets): + if self.data.params.prediction_type == 'classification' and self.params.sampling_method is not None: + train = sample.apply_sampling_method(train, params, random_state=self.random_state, seed=self.seed) train = self.model_wrapper.transform_dataset(train) valid = self.model_wrapper.transform_dataset(valid) self.data.train_valid_dsets[i] = (train, valid) @@ -342,6 +367,13 @@ def create_model_metadata(self): hyperparam_uuid=self.params.hyperparam_uuid, ampl_version=mu.get_ampl_version() ) + # add in sampling method parameters for documentation/reproducibility + if self.params.sampling_method is not None: + model_params['sampling_method'] = self.params.sampling_method + if self.params.sampling_ratio is not None: + model_params['sampling_ratio'] = self.params.sampling_ratio + if self.params.sampling_k_neighbors is not None: + model_params['sampling_k_neighbors'] = self.params.sampling_k_neighbors splitting_metadata = self.data.get_split_metadata() model_metadata = dict( @@ -360,6 +392,8 @@ def 
create_model_metadata(self): model_metadata[key] = data for key, data in trans.get_transformer_specific_metadata(self.params).items(): model_metadata[key] = data + + model_metadata['seed'] = self.seed self.model_metadata = model_metadata @@ -413,6 +447,28 @@ def save_model_metadata(self, retries=5, sleep_sec=60): trkr.save_model_tarball(self.output_dir, self.params.model_tarball_path) self.model_wrapper._clean_up_excess_files(self.model_wrapper.model_dir) + # **************************************************************************************** + def create_split_metadata(self): + """Creates metadata for each split dataset. + It will save the seed used to create the split dataset and relevant parameters.""" + self.split_data = dict( + dataset_key = self.params.dataset_key, + id_col = self.params.id_col, + smiles_col = self.params.smiles_col, + response_cols = self.params.response_cols, + seed = self.seed + ) + self.splitting_metadata = self.data.get_split_metadata() + self.split_data['splitting_metadata'] = self.splitting_metadata + + # **************************************************************************************** + def save_split_metadata(self): + out_file = os.path.join(self.output_dir, 'split_metadata.json') + + with open(out_file, 'w') as out: + json.dump(self.split_data, out, sort_keys=True, indent=4, separators=(',', ': ')) + out.write("\n") + # **************************************************************************************** def create_prediction_metadata(self, prediction_results): """Initializes a data structure to hold performance metrics from a model run on a new dataset, @@ -540,7 +596,7 @@ def split_dataset(self, featurization=None): # **************************************************************************************** - def train_model(self, featurization=None): + def train_model(self, featurization=None, random_state=None, seed=None): """Build model described by self.params on the training dataset described by self.params. 
Generate predictions for the training, validation, and test datasets, and save the predictions and @@ -574,7 +630,7 @@ def train_model(self, featurization=None): ## create model wrapper if not split_only if not self.params.split_only: - self.model_wrapper = model_wrapper.create_model_wrapper(self.params, self.featurization, self.ds_client) + self.model_wrapper = model_wrapper.create_model_wrapper(self.params, self.featurization, self.ds_client, random_state=self.random_state, seed=self.seed) self.model_wrapper.setup_model_dirs() self.load_featurize_data() diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 0b5f6e44..8abc7709 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -171,7 +171,7 @@ def all_bases(model): return result # **************************************************************************************** -def create_model_wrapper(params, featurizer, ds_client=None): +def create_model_wrapper(params, featurizer, ds_client=None, random_state=None, seed=None): """Factory function for creating Model objects of the correct subclass for params.model_type. Args: @@ -189,11 +189,11 @@ def create_model_wrapper(params, featurizer, ds_client=None): """ if params.model_type == 'NN': if params.featurizer == 'graphconv': - return GraphConvDCModelWrapper(params, featurizer, ds_client) + return GraphConvDCModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) else: - return MultitaskDCModelWrapper(params, featurizer, ds_client) + return MultitaskDCModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) elif params.model_type == 'RF': - return DCRFModelWrapper(params, featurizer, ds_client) + return DCRFModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) elif params.model_type == 'xgboost': if not xgboost_supported: raise Exception("Unable to import xgboost. 
\ @@ -209,9 +209,9 @@ def create_model_wrapper(params, featurizer, ds_client=None): installation: \ from pip: pip install xgboost==0.90") else: - return DCxgboostModelWrapper(params, featurizer, ds_client) + return DCxgboostModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) elif params.model_type == 'hybrid': - return HybridModelWrapper(params, featurizer, ds_client) + return HybridModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) elif params.model_type in pp.model_wl: requested_model = pp.model_wl[params.model_type] bases = all_bases(requested_model) @@ -220,9 +220,9 @@ def create_model_wrapper(params, featurizer, ds_client=None): if any(['TorchModel' in str(b) for b in bases]): if not afp_supported: raise Exception("dgl and dgllife packages must be installed to use attentive_fp model.") - return PytorchDeepChemModelWrapper(params, featurizer, ds_client) + return PytorchDeepChemModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) elif any(['KerasModel' in str(b) for b in bases]): - return KerasDeepChemModelWrapper(params, featurizer, ds_client) + return KerasDeepChemModelWrapper(params, featurizer, ds_client, random_state=random_state, seed=seed) else: raise ValueError("Unknown model_type %s" % params.model_type) @@ -252,7 +252,7 @@ class ModelWrapper(object): best_model_dir (str): The subdirectory under output_dir that contains the best model. Created in setup_model_dirs """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes ModelWrapper object. 
Args: @@ -290,6 +290,9 @@ def __init__(self, params, featurizer, ds_client): self.transformers_x = [] self.transformers_w = [] + self.random_state = random_state + self.seed = seed + # **************************************************************************************** def setup_model_dirs(self): @@ -1177,7 +1180,7 @@ class HybridModelWrapper(NNModelWrapper): """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes HybridModelWrapper object. Args: @@ -1202,7 +1205,11 @@ def __init__(self, params, featurizer, ds_client): model: dc.models.TorchModel """ - super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) + + self.random_state = random_state + self.seed = seed + if self.params.layer_sizes is None: if self.params.featurizer == 'ecfp': self.params.layer_sizes = [1000, 500] @@ -1601,7 +1608,7 @@ class ForestModelWrapper(ModelWrapper): contains code that is similar between the two tree based classes """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes DCRFModelWrapper object. Args: @@ -1610,12 +1617,15 @@ def __init__(self, params, featurizer, ds_client): featurizer (Featurization): Object managing the featurization of compounds ds_client: datastore client. 
""" - super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) self.best_model_dir = os.path.join(self.output_dir, 'best_model') self.model_dir = self.best_model_dir os.makedirs(self.best_model_dir, exist_ok=True) - self.model = self.make_dc_model(self.best_model_dir) + self.random_state = random_state + self.seed = seed + + self.model = self.make_dc_model(self.best_model_dir, random_state=random_state, seed=seed) # **************************************************************************************** def train(self, pipeline): @@ -1684,7 +1694,7 @@ def train(self, pipeline): self.best_epoch = 0 # **************************************************************************************** - def make_dc_model(self, model_dir): + def make_dc_model(self, model_dir, random_state=None, seed=None): """Build a DeepChem model. Builds a model, wraps it in DeepChem's wrapper and returns it @@ -1808,7 +1818,7 @@ class DCRFModelWrapper(ForestModelWrapper): """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes DCRFModelWrapper object. Args: @@ -1817,10 +1827,12 @@ def __init__(self, params, featurizer, ds_client): featurizer (Featurization): Object managing the featurization of compounds ds_client: datastore client. """ - super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) + self.random_state = random_state + self.seed = seed # **************************************************************************************** - def make_dc_model(self, model_dir): + def make_dc_model(self, model_dir, random_state=None, seed=None): """Build a DeepChem model. 
Builds a model, wraps it in DeepChem's wrapper and returns it @@ -1835,12 +1847,14 @@ def make_dc_model(self, model_dir): rf_model = RandomForestRegressor(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, - n_jobs=-1) + n_jobs=-1, + random_state=self.seed) else: rf_model = RandomForestClassifier(n_estimators=self.params.rf_estimators, max_features=self.params.rf_max_features, max_depth=self.params.rf_max_depth, - n_jobs=-1) + n_jobs=-1, + random_state=self.seed) return dc.models.sklearn_models.SklearnModel(rf_model, model_dir=model_dir) @@ -1959,7 +1973,7 @@ class DCxgboostModelWrapper(ForestModelWrapper): """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes RunModel object. Args: @@ -1968,10 +1982,12 @@ def __init__(self, params, featurizer, ds_client): featurizer (Featurization): Object managing the featurization of compounds ds_client: datastore client. """ - super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) + self.random_state = random_state + self.seed = seed # **************************************************************************************** - def make_dc_model(self, model_dir): + def make_dc_model(self, model_dir, random_state=None, seed=None): """Build a DeepChem model. 
Builds a model, wraps it in DeepChem's wrapper and returns it @@ -1999,7 +2015,7 @@ def make_dc_model(self, model_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=0, + random_state= self.seed, #0, missing=np.nan, importance_type='gain', n_jobs=-1, @@ -2024,7 +2040,7 @@ def make_dc_model(self, model_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=0, + random_state=self.seed, #0, importance_type='gain', missing=np.nan, gpu_id = -1, @@ -2134,7 +2150,7 @@ def reload_model(self, reload_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=0, + random_state=self.seed, #0, missing=np.nan, importance_type='gain', n_jobs=-1, @@ -2159,7 +2175,7 @@ def reload_model(self, reload_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=0, + random_state=self.seed, #0, importance_type='gain', missing=np.nan, gpu_id = -1, @@ -2311,7 +2327,7 @@ class PytorchDeepChemModelWrapper(NNModelWrapper): valid_perfs (dict): A dictionary of predicted values and metrics on the validation dataset """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes AttentiveFPModelWrapper object. Creates the underlying DeepChem AttentiveFPModel instance. Args: @@ -2321,9 +2337,10 @@ def __init__(self, params, featurizer, ds_client): ds_client: datastore client. """ # use NNModelWrapper init. 
- super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) self.num_epochs_trained = 0 - + self.random_state = random_state + self.seed = seed self.model = self.recreate_model() # **************************************************************************************** @@ -2349,6 +2366,7 @@ def recreate_model(self, **kwargs): # build the model model = chosen_model( + sed = self.seed, **extracted_features ) @@ -2679,7 +2697,7 @@ class GraphConvDCModelWrapper(KerasDeepChemModelWrapper): """ - def __init__(self, params, featurizer, ds_client): + def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """Initializes GraphConvDCModelWrapper object. Args: @@ -2712,7 +2730,8 @@ def __init__(self, params, featurizer, ds_client): self.g = tf.Graph() self.sess = tf.compat.v1.Session(graph=self.g) self.num_epochs_trained = 0 - + self.random_state = random_state + self.seed = seed self.model = self.recreate_model(model_dir=self.model_dir) # **************************************************************************************** @@ -2750,7 +2769,8 @@ def recreate_model(self, model_dir=None): dense_layer_size=self.params.layer_sizes[-1], dropout=self.params.dropouts, penalty=self.params.weight_decay_penalty, - penalty_type=self.params.weight_decay_penalty_type) + penalty_type=self.params.weight_decay_penalty_type, + seed=self.seed) return model # **************************************************************************************** diff --git a/atomsci/ddm/pipeline/parameter_parser.py b/atomsci/ddm/pipeline/parameter_parser.py index 8d960b38..74ba4582 100644 --- a/atomsci/ddm/pipeline/parameter_parser.py +++ b/atomsci/ddm/pipeline/parameter_parser.py @@ -542,7 +542,7 @@ def get_list_args(self): } convert_to_int_list = {'layer_sizes','rf_max_features','rf_estimators', 'rf_max_depth', 'umap_dim', 'umap_neighbors', 'layer_nums', 'node_nums', - 'xgb_max_depth', 
'xgb_n_estimators'}.union(all_auto_int_lists()) + 'xgb_max_depth', 'xgb_n_estimators', 'seed'}.union(all_auto_int_lists()) convert_to_numeric_list = convert_to_float_list | convert_to_int_list keep_as_list = {'dropouts','weight_init_stddevs','bias_init_consts', 'layer_sizes','dropout_list','layer_nums'}.union(all_auto_lists()) @@ -1043,6 +1043,10 @@ def get_parser(): '--verbose', dest='verbose', action='store_true', help='True/False flag for setting verbosity') parser.set_defaults(verbose=False) + parser.add_argument( + '--seed', dest='seed', default=None, + help='Random seed used for initializing the random number generator to ensure results are reproducible.' + 'Default is None and a random seed will be generated.') # ********************************************************************************************************** # model_building_parameters: graphconv @@ -1229,6 +1233,18 @@ def get_parser(): help='Type of splitter to use: index, random, scaffold, butina, ave_min, temporal, fingerprint, multitaskscaffold or stratified.' ' Used to set the splitting.py subclass. Can be input as a comma separated list for hyperparameter search' ' (e.g. \'scaffold\',\'random\')') + # sampling specific parameters (imbalance-learn) + parser.add_argument( + '--sampling_method', dest='sampling_method', type=str, default=None, + help='Method for sampling to address class imbalance (e.g., \'undersampling\', \'SMOTE\')') + + parser.add_argument( + '--sampling_ratio', dest='sampling_ratio', type=str, default='auto', + help='The desired ratio of the minority class to the majority class after sampling.') + parser.add_argument( + '--sampling_k_neighbors', dest='sampling_k_neighbors', type=int, default=5, + help='The nearest neighbors used to define the neighborhood of samples to use to generate the synthetic samples. 
Specifically used for SMOTE.') + parser.add_argument( '--mtss_num_super_scaffolds', default=40, type=int, diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index df5fd7bc..4fe2eb00 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -261,8 +261,8 @@ def model_choice_score(self, score_type='r2'): """ ids, pred_vals, stds = self.get_pred_values() - real_vals = self.get_real_values(ids) - weights = self.get_weights(ids) + real_vals = self.get_real_values(ids=ids) + weights = self.get_weights(ids=ids) scores = [] for i in range(self.num_tasks): nzrows = np.where(weights[:,i] != 0)[0] @@ -1029,7 +1029,11 @@ def get_pred_values(self): otherwise. """ - ids = sorted(self.pred_vals.keys()) + #ids = sorted(self.pred_vals.keys()) + all_ids = sorted(self.pred_vals.keys()) + # with kfold + SMOTE, not all ids have predictions + ids = [id for id in all_ids if not (self.pred_vals[id].size == 0)] + if self.subset in ['train', 'test', 'train_valid']: rawvals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids]) vals = dc.trans.undo_transforms(rawvals, self.transformers) diff --git a/atomsci/ddm/pipeline/random_seed.py b/atomsci/ddm/pipeline/random_seed.py new file mode 100644 index 00000000..9fb541ae --- /dev/null +++ b/atomsci/ddm/pipeline/random_seed.py @@ -0,0 +1,62 @@ +""" Used to set random seed from parameter_parser for reproducibility. """ +import numpy as np +import uuid +import random +import torch +import tensorflow as tf +#---------------------------------------------------------------------------------- +class RandomStateGenerator: + """ + A class to manage random state and seed generation for reproducible randomness. + + Attributes: + params: Additional parameters. + seed: The seed for the random state. + random_state: The random state generator. 
+ """ + def __init__(self, params=None, seed=None): + self.params = params + if seed is not None: + self.seed = seed + elif self.params.seed is not None: + self.seed = self.params.seed + else: + self.seed = uuid.uuid4().int % (2**32) + self.set_seed(self.seed) + + def set_seed(self, seed): + """Set the seed for all relevant libraries.""" + + global _seed, _random_state + _seed = seed + + _random_state = np.random.default_rng(_seed) + + # set seed for numpy + np.random.default_rng(_seed) + + # needed for deepchem + np.random.seed(_seed) + + # set seed for random + random.seed(_seed) + + # set seed for PyTorch + torch.manual_seed(_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(_seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # set seed for tensorflow + tf.random.set_seed(_seed) + + self.random_state = _random_state + + def get_seed(self): + """Returns the seed when called""" + return self.seed + + def get_random_state(self): + """Returns the random state when called""" + return self.random_state \ No newline at end of file diff --git a/atomsci/ddm/pipeline/sampling.py b/atomsci/ddm/pipeline/sampling.py new file mode 100644 index 00000000..c1fb28b1 --- /dev/null +++ b/atomsci/ddm/pipeline/sampling.py @@ -0,0 +1,55 @@ +"""Module used to perform sampling on classification datasets.""" +import numpy as np +# sampling specific libraries +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import RandomUnderSampler +# deepchem for dataset +import deepchem as dc +# ===================================================================================================== +def apply_sampling_method(train, params, random_state=None, seed=None): + """ + Apply a sampling method to a classification dataset when split_strategy=='train_valid_test' + + Inputs: + - train: DeepChem NumpyDataset with train.X, train.y, train.w, and train.ids + - params (NameSpace object): contains all the parameter 
information. + Returns: + - train_resampled: a DeepChem NumpyDataset with train.X, train.y, train.w, and train.ids + """ + sampling_ratio = params.sampling_ratio + + if params.sampling_method=='SMOTE': + sampling_k_neighbors = params.sampling_k_neighbors # smote specific parameter + smote=SMOTE(sampling_strategy=sampling_ratio, k_neighbors=sampling_k_neighbors, random_state=seed) + X_resampled, y_resampled = smote.fit_resample(train.X, train.y.ravel()) + y_resampled=y_resampled.reshape(-1, 1) + + # calculate synthetic weights + num_original = len(train.X) + num_synthetic = len(X_resampled)-num_original + + # set the new weights equal to 1 + average_weight = 1 #np.mean(train.w) + synthetic_weights=np.full((num_synthetic,1), average_weight, dtype=np.float64) + resampled_weights=np.concatenate([train.w, synthetic_weights]) + + # update the id length with synthetic ids for any newly introduced data + synthetic_ids = [f"synthetic_{i}" for i in range(num_synthetic)] + new_ids = np.concatenate([train.ids, synthetic_ids]) + + elif params.sampling_method == 'undersampling': + undersampler = RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed) + X_resampled, y_resampled = undersampler.fit_resample(train.X, train.y.ravel()) + y_resampled=y_resampled.reshape(-1, 1) + + #adjust weights and ids + resampled_indices = undersampler.sample_indices_ + resampled_weights = train.w[resampled_indices] + new_ids = train.ids[resampled_indices] + + else: + raise ValueError(f"Unknown sampling method: {params.sampling_method}. 
Supported methods are 'SMOTE' and 'undersampling'.") + # return a new dc.data.NumpyDataset with the resampled data, the original weights and ids + train_resampled= dc.data.NumpyDataset(X_resampled, y_resampled, resampled_weights, new_ids) + + return train_resampled \ No newline at end of file diff --git a/atomsci/ddm/pipeline/splitting.py b/atomsci/ddm/pipeline/splitting.py index f5ddf8a5..a16d7bae 100644 --- a/atomsci/ddm/pipeline/splitting.py +++ b/atomsci/ddm/pipeline/splitting.py @@ -30,7 +30,7 @@ 'mtss_train_valid_dist_weight', 'mtss_split_fraction_weight', 'mtss_num_pop', 'mtss_response_distr_weight'] -def create_splitting(params): +def create_splitting(params, random_state=None, seed=None): """Factory function to create appropriate type of Splitting object, based on dataset parameters Args: @@ -46,11 +46,11 @@ def create_splitting(params): """ if params.production: - return ProductionSplitting(params) + return ProductionSplitting(params, random_state=random_state, seed=seed) elif params.split_strategy == 'train_valid_test': - return TrainValidTestSplitting(params) + return TrainValidTestSplitting(params, random_state=random_state, seed=seed) elif params.split_strategy == 'k_fold_cv': - return KFoldSplitting(params) + return KFoldSplitting(params, random_state=random_state, seed=seed) else: raise Exception("Unknown split strategy %s" % params.split_strategy) @@ -175,7 +175,7 @@ class Splitting(object): """ - def __init__(self, params): + def __init__(self, params, random_state=None, seed=None): """Constructor, also serves as a factory method for creating the associated DeepChem splitter object Args: @@ -196,6 +196,9 @@ def __init__(self, params): splitter (Deepchem split object): A splitting object of the subtype specified by split """ + self.random_state = random_state + self.seed = seed + self.params = params self.split = params.splitter if params.splitter == 'index': @@ -283,7 +286,7 @@ class KFoldSplitting(Splitting): """ - def __init__(self, params): + 
def __init__(self, params, random_state=None, seed=None): """Initialization method for KFoldSplitting. Sets the following attributes for KFoldSplitting: @@ -296,8 +299,11 @@ def __init__(self, params): num_folds (int): The number of k-fold splits to perform """ - super().__init__(params) + super().__init__(params, random_state, seed) self.num_folds = params.num_folds + self.random_state = random_state + self.seed = seed + # **************************************************************************************** @@ -367,13 +373,13 @@ def split_dataset(self, dataset, attr_df, smiles_col): # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the # training set to split it into training/validation folds. if self.split == 'butina': - train_cv, test, _ = self.splitter.train_valid_test_split(dataset) + train_cv, test, _ = self.splitter.train_valid_test_split(dataset, seed=self.seed) self.splitter = dc.splits.ScaffoldSplitter() - train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds) + train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds, seed=self.seed) else: # TODO: Add special handling for AVE splitter - train_cv, test = self.splitter.train_test_split(dataset, frac_train=train_frac) - train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds) + train_cv, test = self.splitter.train_test_split(dataset, frac_train=train_frac, seed=self.seed) + train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds, seed=self.seed) train_valid_dsets = [] train_valid_attr = [] @@ -406,7 +412,7 @@ class TrainValidTestSplitting(Splitting): """ - def __init__(self, params): + def __init__(self, params, random_state=None, seed=None): """Initialization method for TrainValidTestSplitting. Sets the following attributes for TrainValidTestSplitting: @@ -419,8 +425,10 @@ def __init__(self, params): num_folds (int): The number of k-fold splits to perform. 
In this case, it is always set to 1 """ - super().__init__(params) + super().__init__(params, random_state=random_state, seed=seed) self.num_folds = 1 + self.random_state = random_state + self.seed = seed # **************************************************************************************** def get_split_prefix(self, parent=''): @@ -493,11 +501,11 @@ def split_dataset(self, dataset, attr_df, smiles_col): if self.split == 'butina': # Can't use train_test_split with Butina because Butina splits into train and valid sets only. - train_valid, test, _ = self.splitter.train_valid_test_split(dataset) + train_valid, test, _ = self.splitter.train_valid_test_split(dataset, seed=self.seed) self.splitter = dc.splits.ScaffoldSplitter() # With Butina splitting, we don't have control over the size of the test set train_frac = 1.0 - self.params.split_valid_frac - train, valid = self.splitter.train_test_split(train_valid, frac_train=train_frac) + train, valid = self.splitter.train_test_split(train_valid, frac_train=train_frac, seed=self.seed) elif self.split == 'ave_min': # AVEMinSplitter also only does train-valid splits, but at least nested splits seem to work. # TODO: Change this if we modify AVE splitter to do 3-way splits internally. 
@@ -506,11 +514,11 @@ def split_dataset(self, dataset, attr_df, smiles_col): log.info("Performing split for test set") train_valid, test, _ = self.splitter.train_valid_test_split(dataset, frac_train=train_valid_frac, frac_valid=self.params.split_test_frac, - frac_test=0.0) + frac_test=0.0, seed=self.seed) log.info("Performing split of training and validation sets") train, valid, _ = self.splitter.train_valid_test_split(train_valid, frac_train=train_frac/train_valid_frac, frac_valid=self.params.split_valid_frac/train_valid_frac, - frac_test=0.0) + frac_test=0.0, seed=self.seed) log.info("Results of 3-way split: %d training, %d validation, %d test compounds" % ( train.X.shape[0], valid.X.shape[0], test.X.shape[0])) elif self.split == 'temporal': @@ -532,11 +540,12 @@ def split_dataset(self, dataset, attr_df, smiles_col): response_distr_fitness_weight=self.params.mtss_response_distr_weight, num_super_scaffolds=self.params.mtss_num_super_scaffolds, num_pop=self.params.mtss_num_pop, - num_generations=self.params.mtss_num_generations) + num_generations=self.params.mtss_num_generations, + seed=self.seed) else: train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac train, valid, test = self.splitter.train_valid_test_split(dataset, - frac_train=train_frac, frac_valid=self.params.split_valid_frac, frac_test=self.params.split_test_frac) + frac_train=train_frac, frac_valid=self.params.split_valid_frac, frac_test=self.params.split_test_frac, seed=self.seed) # After splitting unique compound_ids or SMILES are expanded train, train_attr = dm.expand_selection(train.ids) @@ -561,10 +570,12 @@ def split( # **************************************************************************************** class ProductionSplitting(Splitting): - def __init__(self, params): + def __init__(self, params, random_state=None, seed=None): """This Splitting only does one thing and ignores all splitter parameters""" self.splitter = ProductionSplitter() self.split = 'production' + 
self.random_state = random_state + self.seed = seed # **************************************************************************************** def get_split_prefix(self, parent=''): @@ -624,7 +635,7 @@ def split_dataset(self, dataset, attr_df, smiles_col): dm = DatasetManager(dataset=dataset, attr_df=attr_df, smiles_col=smiles_col, needs_smiles=self.needs_smiles()) dataset = dm.compact_dataset() - train, valid, test = self.splitter.train_valid_test_split(dataset) + train, valid, test = self.splitter.train_valid_test_split(dataset, seed=self.seed) # After splitting unique compound_ids or SMILES are expanded train, train_attr = dm.expand_selection(train.ids) diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_SMOTE.json new file mode 100644 index 00000000..41febfb6 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_SMOTE.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"sampling_method": "SMOTE", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_undersampling.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_undersampling.json new file mode 100644 index 00000000..86f7bcf2 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_NN_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": 
"k_fold_cv", +"splitter": "random", +"sampling_method": "undersampling", +"smiles_col": "base_rdkit_smiles", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_SMOTE.json new file mode 100644 index 00000000..d2b95fb8 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_SMOTE.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"sampling_method": "SMOTE", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_undersampling.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_undersampling.json new file mode 100644 index 00000000..fda25eff --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_RF_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"sampling_method": "undersampling", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"smiles_col": "base_rdkit_smiles", +"id_col": "compound_id", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git 
a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_SMOTE.json new file mode 100644 index 00000000..dafca23c --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_SMOTE.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"sampling_method": "SMOTE", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_undersampling.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_undersampling.json new file mode 100644 index 00000000..14b43f1c --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/kfold_cv_xgboost_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"sampling_method": "undersampling", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_SMOTE.json new file mode 100644 index 00000000..7fdbb4d3 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_SMOTE.json @@ 
-0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"splitter": "scaffold", +"sampling_method": "SMOTE", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_undersampling.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_undersampling.json new file mode 100644 index 00000000..bd771283 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_NN_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"sampling_method": "undersampling", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_SMOTE.json new file mode 100644 index 00000000..88a761f7 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_SMOTE.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"sampling_method": "SMOTE", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", 
+"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_undersampling.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_undersampling.json new file mode 100644 index 00000000..e88717a1 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_RF_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"sampling_method": "undersampling", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_SMOTE.json b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_SMOTE.json new file mode 100644 index 00000000..8845062f --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_SMOTE.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"sampling_method": "SMOTE", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_undersampling.json 
b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_undersampling.json new file mode 100644 index 00000000..840333e1 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_json/train_valid_test_xgboost_undersampling.json @@ -0,0 +1,16 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"sampling_method": "undersampling", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"response_cols":"active", +"smiles_col": "base_rdkit_smiles", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py new file mode 100644 index 00000000..41e2a7f7 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python +"""Testing the sampling methods. Want to ensure that the model pipeline works and that the sampling methods are incorporated. +Based off of the test_kfold_split.py method. 
""" +import pandas as pd +import sklearn.metrics as skmetrics +import copy +import os +import json + +from atomsci.ddm.pipeline import model_pipeline as mp +from atomsci.ddm.pipeline import parameter_parser as parse +import atomsci.ddm.pipeline.predict_from_model as pfm + +#------------------------------------------------------------------- + +def get_test_set(dataset_key, split_csv, id_col): + """ + Read the dataset and its split file and return the dataset rows assigned to the test subset + + Parameters: + - dataset_key: path to csv file of dataset + - split_csv: path to split csv file + - id_col: name of ID column + + Returns: + - test_df: dataframe of the rows whose ID is listed under subset 'test' in the split file + """ + df = pd.read_csv(dataset_key) + split_df=pd.read_csv(split_csv) + test_df = df[df[id_col].isin(split_df[split_df['subset']=='test']['cmpd_id'])] + + return test_df + +def split(pparams): + split_params=copy.copy(pparams) + split_params.split_only=True + split_params.previously_split=False + + model_pipeline= mp.ModelPipeline(split_params) + split_uuid = model_pipeline.split_dataset() + + return split_uuid + +def train(pparams): + train_pipe = mp.ModelPipeline(pparams) + train_pipe.train_model() + + return train_pipe + +def find_best_test_metric(model_metrics): + for metric in model_metrics: + if metric['label'] == 'best' and metric['subset']=='test': + return metric + return None + +def saved_model_identity(pparams): + script_path = os.path.dirname(os.path.realpath(__file__)) + + model_pipe = mp.ModelPipeline(pparams) + + if not pparams.previously_split: + split_uuid = model_pipe.split_dataset() + pparams.split_uuid = split_uuid + pparams.previously_split = True + pparams.split_only=False + + try: + model_pipe.train_model() + except Exception as e: + print(f"Error during model training: {e}") + raise # re-raise so a failed training run fails the test instead of passing silently + + #train_pipe = train(pparams) + split_csv = os.path.join(script_path, '../../test_datasets/', model_pipe.data._get_split_key()) + test_df = get_test_set(pparams.dataset_key, split_csv, pparams.id_col) + + # load
model metrics from file + with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: + model_metrics = json.load(f) + + metrics = find_best_test_metric(model_metrics) + id_col = metrics['input_dataset']['id_col'] + response_col=metrics['input_dataset']['response_cols'][0] + smiles_col = metrics['input_dataset']['smiles_col'] + test_length = metrics['prediction_results']['num_compounds'] + + # predict from model + model_tar = model_pipe.params.model_tarball_path + pred_df = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, + smiles_col=smiles_col, response_col=response_col) + pred_df2 = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, + smiles_col=smiles_col, response_col=response_col) + + X = pred_df[response_col+'_actual'].values + y = pred_df[response_col+'_pred'].values + X2 = pred_df2[response_col+'_actual'].values + y2 = pred_df2[response_col+'_pred'].values + + accuracy = skmetrics.accuracy_score(X, y) + precision = skmetrics.precision_score(X, y, average='weighted') + recall = skmetrics.recall_score(X, y, average='weighted') + prc_auc = skmetrics.average_precision_score(X, y) + + saved_accuracy = metrics['prediction_results']['accuracy_score'] + saved_precision = metrics['prediction_results']['precision'] + saved_recall = metrics['prediction_results']['recall_score'] + saved_prc_auc = metrics['prediction_results']['prc_auc_score'] + + # show results + print(metrics['subset']) + print(pred_df.columns) + print("Accuracy difference:", abs(accuracy - saved_accuracy)) + print("Precision difference:", abs(precision - saved_precision)) + print("Recall difference:", abs(recall-saved_recall)) + print("PRC AUC difference:", abs(prc_auc-saved_prc_auc)) + + assert abs(accuracy-saved_accuracy) < 1 \ + and abs(precision - saved_precision) < 1 \ + and abs(recall-saved_recall) < 1 \ + and abs(prc_auc - saved_prc_auc) < 1 \ + and (test_length == len(test_df)) + +#------------------------------------------------------------------- 
+ +#-------- random forest +def train_valid_test_RF_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/train_valid_test_RF_SMOTE.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_RF_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_RF_SMOTE.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_RF_undersampling_test(): + + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_RF_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def train_valid_test_RF_undersampling_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/train_valid_test_RF_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +#-------- neural network + +def train_valid_test_NN_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + 
json_file = os.path.join(script_path, 'sampling_json/train_valid_test_NN_SMOTE.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def train_valid_test_NN_undersampling_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/train_valid_test_NN_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_NN_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_NN_SMOTE.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_NN_undersampling_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_NN_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +#-------- xgboost + +def train_valid_test_xgboost_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/train_valid_test_xgboost_SMOTE.json') + + pparams = parse.wrapper(['--config_file', 
json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def train_valid_test_xgboost_undersampling_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/train_valid_test_xgboost_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_xgboost_SMOTE_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_xgboost_SMOTE.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) + +def k_fold_cv_xgboost_undersampling_test(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'sampling_json/kfold_cv_xgboost_undersampling.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key= os.path.join(script_path, + '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + pparams.split_uuid= 'test-split' + + saved_model_identity(pparams) +#------------------------------------------------------------------- + +if __name__=='__main__': + print('train_valid_test_RF_SMOTE_test') + train_valid_test_RF_SMOTE_test() + + print('train_valid_test_NN_SMOTE_test') + train_valid_test_NN_SMOTE_test() + + print("train_valid_test_RF_undersampling_test") + train_valid_test_RF_undersampling_test() + + 
print("train_valid_test_NN_undersampling_test") + train_valid_test_NN_undersampling_test() + + print("kfold_cv_NN_SMOTE_test") + k_fold_cv_NN_SMOTE_test() + + print("kfold_cv_NN_undersampling_test") + k_fold_cv_NN_undersampling_test() + + print("kfold_cv_RF_SMOTE_test") + k_fold_cv_RF_SMOTE_test() + + print("kfold_cv_RF_undersampling_test") + k_fold_cv_RF_undersampling_test() + + print("train_valid_test_xgboost_SMOTE_test") + train_valid_test_xgboost_SMOTE_test() + + print("train_valid_test_xgboost_undersampling_test") + train_valid_test_xgboost_undersampling_test() + + print("k_fold_cv_xgboost_SMOTE_test") + k_fold_cv_xgboost_SMOTE_test() + + print("k_fold_cv_xgboost_undersampling_test") + k_fold_cv_xgboost_undersampling_test() + + print("Passed!") \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_kfold_test.json new file mode 100644 index 00000000..7ce26e8b --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_train_valid_test.json new file mode 100644 index 00000000..c3eda35a --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/nn_classification_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": 
"False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_kfold_test.json new file mode 100644 index 00000000..dede5f7b --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_train_valid_test.json new file mode 100644 index 00000000..ba86fcce --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/nn_regression_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git 
a/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_kfold_test.json new file mode 100644 index 00000000..dd8f6b64 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json new file mode 100644 index 00000000..4263b742 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_kfold_test.json new file mode 100644 index 00000000..30bcc444 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", 
+"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_train_valid_test.json new file mode 100644 index 00000000..05415938 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/rf_regression_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "RF", +"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_kfold_test.json new file mode 100644 index 00000000..7c62dcab --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git 
a/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_train_valid_test.json new file mode 100644 index 00000000..a8072c5b --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_classification_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_kfold_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_kfold_test.json new file mode 100644 index 00000000..7bd80469 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_kfold_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_train_valid_test.json new file mode 100644 index 00000000..a96c72a3 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/xgboost_regression_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": 
"False", +"save_results": "False", +"model_type": "xgboost", +"featurizer": "ecfp", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_models.py b/atomsci/ddm/test/integrative/seed_test/seed_test_models.py new file mode 100644 index 00000000..d99d7eaf --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/seed_test_models.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python +"""Testing the reproducibility of seeding a random seed in AMPL to reproduce models.""" +import pandas as pd +import copy +import os +import json + +from atomsci.ddm.pipeline import model_pipeline as mp +from atomsci.ddm.pipeline import parameter_parser as parse + +#------------------------------------------------------------------- +""" +This script does the following: +1. Generates a model +2. Extracts the seed from the model's metadata +3. Runs the model training again with seed +4. 
Compares the prediction scores to ensure they're identical + +Creates and tests the following models: +- RF, NN +- regression, classification +- train_valid_test split, k-fold cv split +""" +#------------------------------------------------------------------- + +def get_test_set(dataset_key, split_csv, id_col): + """ + Read the dataset key and split_uuid to split the dataset into split components + """ + df = pd.read_csv(dataset_key) + split_df = pd.read_csv(split_csv) + test_df = df[df[id_col].isin(split_df[split_df['subset'] == 'test']['cmpd_id'])] + return test_df + +def find_best_test_metric(model_metrics): + for metric in model_metrics: + if metric['label'] == 'best' and metric['subset'] == 'test': + return metric + return None + +def extract_seed(metadata_path): + with open(metadata_path, 'r') as f: + metadata = json.load(f) + return metadata.get('seed') + +def modify_params_with_seed(pparams, seed): + pparams.seed = seed + return pparams + +def saved_model_identity(pparams): + + retrain_pparams = copy.copy(pparams) + + model_pipe = mp.ModelPipeline(pparams) + + if not pparams.previously_split: + split_uuid = model_pipe.split_dataset() + pparams.split_uuid = split_uuid + pparams.previously_split = True + pparams.split_only = False + try: + model_pipe.train_model() + except Exception as e: + print(f"Error during model training: {e}") + return + + # load model metrics from file + + try: + with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: + model_metrics = json.load(f) + except Exception as e: + print(f"Error during loading model metrics: {e}") + return + + original_metrics = find_best_test_metric(model_metrics) + if pparams.prediction_type == 'regression': + original_mae = original_metrics['prediction_results']['mae_score'] + original_r2 = original_metrics['prediction_results']['r2_score'] + original_rms_score = original_metrics['prediction_results']['rms_score'] + elif pparams.prediction_type == 'classification': + original_accuracy 
= original_metrics['prediction_results']['accuracy_score'] + original_precision = original_metrics['prediction_results']['precision'] + original_recall = original_metrics['prediction_results']['recall_score'] + original_prc_auc = original_metrics['prediction_results']['prc_auc_score'] + + + # extract the seed + metadata_path = os.path.join(pparams.output_dir, 'model_metadata.json') + seed = extract_seed(metadata_path) + + # add the seed to the params + retrain_pparams.seed = seed + retrain_pparams.model_uuid = None + + # retrain the model + retrain_pipe = mp.ModelPipeline(retrain_pparams) + retrain_pipe.train_model() + #retrain_pipe = train(pparams) + + # extract the metrics from the retrained model + with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: + retrained_model_metrics = json.load(f) + + retrained_metrics = find_best_test_metric(retrained_model_metrics) + if pparams.prediction_type == 'regression': + retrained_mae = retrained_metrics['prediction_results']['mae_score'] + retrained_r2 = retrained_metrics['prediction_results']['r2_score'] + retrained_rms_score = retrained_metrics['prediction_results']['rms_score'] + elif pparams.prediction_type == 'classification': + retrained_accuracy = retrained_metrics['prediction_results']['accuracy_score'] + retrained_precision = retrained_metrics['prediction_results']['precision'] + retrained_recall = retrained_metrics['prediction_results']['recall_score'] + retrained_prc_auc = retrained_metrics['prediction_results']['prc_auc_score'] + + if pparams.prediction_type == 'regression': + print("MAE difference:", abs(original_mae-retrained_mae)) + print("R2 difference:", abs(original_r2 - retrained_r2)) + print("RMS Score difference:", abs(original_rms_score - retrained_rms_score)) + + assert abs(original_mae-retrained_mae) < 1e-9 \ + and abs(original_r2 - retrained_r2) < 1e-9 \ + and abs(original_rms_score - retrained_rms_score) < 1e-9 + + elif pparams.prediction_type == 'classification': + 
print("Accuracy difference:", abs(original_accuracy - retrained_accuracy)) + print("Precision difference:", abs(original_precision - retrained_precision)) + print("Recall difference:", abs(original_recall - retrained_recall)) + print("PRC AUC difference:", abs(original_prc_auc- retrained_prc_auc)) + + assert abs(original_accuracy - retrained_accuracy) < 1e-9 \ + and abs(original_precision - retrained_precision) < 1e-9 \ + and abs(original_recall - retrained_recall) < 1e-9 \ + and abs(original_prc_auc - retrained_prc_auc) < 1e-9 + +#------------------------------------------------------------------- +# Random Forest +def test_RF_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/rf_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_RF_classification_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/rf_classification_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_RF_regression_kfold_cv_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/rf_regression_kfold_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_RF_classification_kfold_cv_reproducibility(): + script_path = 
os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/rf_classification_kfold_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +# Neural Network +def test_NN_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/nn_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_NN_regression_kfold_cv_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/nn_regression_kfold_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_NN_classification_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/nn_classification_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_NN_classification_kfold_cv_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/nn_classification_kfold_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, 
'../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +# XGBoost +def test_xgboost_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/xgboost_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_xgboost_classification_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/xgboost_classification_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_xgboost_regression_kfold_cv_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/xgboost_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_xgboost_classification_kfold_cv_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/xgboost_classification_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +if __name__ == "__main__": + # ------ random forest + 
print("test_RF_regression_reproducibility") + test_RF_regression_reproducibility() + print("test_RF_regression_kfold_reproducibility") + test_RF_regression_kfold_cv_reproducibility() + print("test_RF_classification_reproducibility") + test_RF_classification_reproducibility() + print("test_RF_classification_kfold_reproducibility") + test_RF_classification_kfold_cv_reproducibility() + + # ------ neural network + print("test_NN_regression_reproducibility") + test_NN_regression_reproducibility() + print("test_NN_regression_kfold_reproducibility") + test_NN_regression_kfold_cv_reproducibility() + print("test_NN_classification_reproducibility") + test_NN_classification_reproducibility() + print("test_NN_classification_kfold_reproducibility") + test_NN_classification_kfold_cv_reproducibility() + + # ------ xgboost + print("test_xgboost_regression_reproducibility") + test_xgboost_regression_reproducibility() + print("test_xgboost_regression_kfold_reproducibility") + test_xgboost_regression_kfold_cv_reproducibility() + print("test_xgboost_classification_reproducibility") + test_xgboost_classification_reproducibility() + print("test_xgboost_classification_kfold_reproducibility") + test_xgboost_classification_kfold_cv_reproducibility() + + print("Passed!") + \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py new file mode 100644 index 00000000..0b102aeb --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +"""Testing the reproducibility of seeding a random seed in AMPL splitters to recreate split datasets.""" + +import pandas as pd +import copy +import json +import os + +from atomsci.ddm.pipeline import model_pipeline as mp +from atomsci.ddm.pipeline import parameter_parser as parse + +#---------------------------------------------------------------------------------------------------------- 
+def split_dataset(pparams): + model_pipe = mp.ModelPipeline(pparams) + split_uuid = model_pipe.split_dataset() + pparams.split_uuid = split_uuid + return pparams + +def extract_seed(metadata_path): + with open(metadata_path, 'r') as f: + metadata = json.load(f) + return metadata.get('seed') + +def modify_params_with_seed(pparams, seed): + pparams.seed = seed + return pparams + +def compare_splits(original_split_csv, retrained_split_csv): + original_split = pd.read_csv(original_split_csv) + retrained_split = pd.read_csv(retrained_split_csv) + + comparison_df = original_split.merge(retrained_split, on='cmpd_id', suffixes=('_original', '_retrained')) + + # Initialize a variable to track if all comparisons are valid + all_match = True + + # Iterate through rows to compare the 'subset' and 'fold' columns + for index, row in comparison_df.iterrows(): + subset_match = (row['subset_original'] == row['subset_retrained']) + fold_match = (row['fold_original'] == row['fold_retrained']) + + if not (subset_match and fold_match): + print(f"Mismatch found for cmpd_id {row['cmpd_id']}: " + f"original subset = {row['subset_original']}, " + f"retrained subset = {row['subset_retrained']}, " + f"original fold = {row['fold_original']}, " + f"retrained fold = {row['fold_retrained']}") + all_match = False + + return all_match + +def perform_splits_and_compare(pparams): + starting_pparams=split_dataset(pparams) + # original split + script_path = os.path.dirname(os.path.realpath(__file__)) + dataset_path = os.path.join(script_path, '../../test_datasets/') + if starting_pparams.split_strategy == 'k_fold_cv': + original_split_csv = os.path.join(dataset_path, f"{starting_pparams.dataset_name}_{starting_pparams.num_folds}_fold_cv_{starting_pparams.splitter}_{starting_pparams.split_uuid}.csv") + else: + original_split_csv = os.path.join(dataset_path, f"{starting_pparams.dataset_name}_{starting_pparams.split_strategy}_{starting_pparams.splitter}_{starting_pparams.split_uuid}.csv") + + # extract 
the seed + metadata_path = os.path.join(starting_pparams.output_dir, 'split_metadata.json') + seed = extract_seed(metadata_path) + + # Retrain split with the same seed + retrain_pparams = copy.copy(pparams) + retrain_pparams.split_uuid = None + retrain_pparams.seed = seed + + retrain_pparams = split_dataset(retrain_pparams) + script_path = os.path.dirname(os.path.realpath(__file__)) + dataset_path = os.path.join(script_path, '../../test_datasets/') + if starting_pparams.split_strategy == 'k_fold_cv': + retrained_split_csv = os.path.join(dataset_path, f"{retrain_pparams.dataset_name}_{retrain_pparams.num_folds}_fold_cv_{retrain_pparams.splitter}_{retrain_pparams.split_uuid}.csv") + else: + retrained_split_csv = os.path.join(dataset_path, f"{retrain_pparams.dataset_name}_{retrain_pparams.split_strategy}_{retrain_pparams.splitter}_{retrain_pparams.split_uuid}.csv") + + # Compare splits + + splits_match = compare_splits(original_split_csv, retrained_split_csv) + + if splits_match is True: + print("The splits match exactly!") + else: + print("The splits do not match.") + + return splits_match + +#---------------------------------------------------------------------------------------------------------- + +def test_random_train_valid_test_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'split_json/test_random_train_valid_test_split.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + +def test_scaffold_train_valid_test_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'split_json/test_scaffold_train_valid_test_split.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, 
'../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + +def test_kfold_random_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'split_json/test_kfold_random_split.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + +def test_kfold_scaffold_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'split_json/test_kfold_scaffold_split.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + +#---------------------------------------------------------------------------------------------------------- + +if __name__ == "__main__": + print("test_random_train_valid_test_split_reproducibility") + test_random_train_valid_test_split_reproducibility() + + print("test_scaffold_train_valid_test_split_reproducibility") + test_scaffold_train_valid_test_split_reproducibility() + + print("test_kfold_random_split_reproducibility") + test_kfold_random_split_reproducibility() + + print("test_kfold_scaffold_split_reproducibility") + test_kfold_scaffold_split_reproducibility() + + print("Passed!") \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_random_split.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_random_split.json new file mode 100644 index 00000000..352e86c5 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_random_split.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": 
"False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "k_fold_cv", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_scaffold_split.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_scaffold_split.json new file mode 100644 index 00000000..00976ccd --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_scaffold_split.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "k_fold_cv", +"splitter": "scaffold", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_random_train_valid_test_split.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_random_train_valid_test_split.json new file mode 100644 index 00000000..188a8aa0 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_random_train_valid_test_split.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json new file mode 100644 
index 00000000..f913cc9d --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "train_valid_test", +"splitter": "scaffold", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file From 22b03183bb43af548074b3c8242531c7fdaf5a91 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Fri, 19 Jul 2024 13:14:41 -0700 Subject: [PATCH 02/57] now it runs --- atomsci/ddm/pipeline/perf_data.py | 72 +++---------------------------- 1 file changed, 7 insertions(+), 65 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 4fe2eb00..39f6f028 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -16,8 +16,6 @@ from atomsci.ddm.pipeline import transformations as trans -import pdb - # ****************************************************************************************************************************** def rms_error(y_real, y_pred): @@ -220,28 +218,6 @@ def __init__(self, model_dataset, subset): self.model_score = None self.weights = None - # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** - def get_pred_values(self): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** - def compute_perf_metrics(self, 
per_task=False): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** # class RegressionPerfData def model_choice_score(self, score_type='r2'): @@ -410,28 +386,7 @@ def __init__(self, model_dataset, subset): self.weights = None # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** - def get_pred_values(self): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** - def compute_perf_metrics(self, per_task=False): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** # class HybridPerfData def model_choice_score(self, score_type='r2'): """Computes a score function based on the accumulated predicted values, to be used for selecting @@ -634,22 +589,6 @@ def __init__(self, model_dataset, subset): self.model_score = None self.weights = None - # **************************************************************************************** - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise NotImplementedError - - # **************************************************************************************** - def get_pred_values(self): - """Raises: - NotImplementedError: The method is implemented by subclasses - """ - raise 
NotImplementedError - - - # **************************************************************************************** # class ClassificationPerfData def model_choice_score(self, score_type='roc_auc'): """Computes a score function based on the accumulated predicted values, to be used for selecting @@ -667,9 +606,10 @@ def model_choice_score(self, score_type='roc_auc'): over tasks. """ + ids, pred_classes, class_probs, prob_stds = self.get_pred_values() - real_vals = self.get_real_values() - weights = self.get_weights() + real_vals = self.get_real_values(ids=ids) + weights = self.get_weights(ids=ids) scores = [] for i in range(self.num_tasks): @@ -1294,7 +1234,10 @@ def get_pred_values(self): probability estimates (only available for the 'train' and 'test' subsets; None otherwise). """ - ids = sorted(self.pred_vals.keys()) + all_ids = sorted(self.pred_vals.keys()) + # with kfold + SMOTE, not all ids have predictions + ids = [id for id in all_ids if not (self.pred_vals[id].size == 0)] + if self.subset in ['train', 'test', 'train_valid']: #class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).mean(axis=0, keepdims=True) # for id in ids], axis=0) @@ -1309,7 +1252,6 @@ def get_pred_values(self): prob_stds = None pred_classes = np.argmax(class_probs, axis=2) - pdb.set_trace() return (ids, pred_classes, class_probs, prob_stds) From 30ea3608075d8f0ac6f9db2c85cf6a4cf6e6b815 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Fri, 19 Jul 2024 13:14:41 -0700 Subject: [PATCH 03/57] kfold changes --- atomsci/ddm/pipeline/perf_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 39f6f028..7e0dfd86 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -386,7 +386,6 @@ def __init__(self, model_dataset, subset): self.weights = None # 
**************************************************************************************** - # class HybridPerfData def model_choice_score(self, score_type='r2'): """Computes a score function based on the accumulated predicted values, to be used for selecting @@ -588,7 +587,8 @@ def __init__(self, model_dataset, subset): self.perf_metrics = [] self.model_score = None self.weights = None - + + # **************************************************************************************** # class ClassificationPerfData def model_choice_score(self, score_type='roc_auc'): """Computes a score function based on the accumulated predicted values, to be used for selecting @@ -606,7 +606,6 @@ def model_choice_score(self, score_type='roc_auc'): over tasks. """ - ids, pred_classes, class_probs, prob_stds = self.get_pred_values() real_vals = self.get_real_values(ids=ids) weights = self.get_weights(ids=ids) From dc1f7c4634bba3c431a7ac3cb2a531b2c82f7419 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Wed, 31 Jul 2024 15:03:57 -0400 Subject: [PATCH 04/57] seed test --- atomsci/ddm/pipeline/model_datasets.py | 3 +- atomsci/ddm/pipeline/perf_data.py | 1 - .../seed_test/seed_test_splitting.py | 42 +++++++++++++++---- .../test_fingerprint_train_valid_test.json | 13 ++++++ .../test_kfold_fingerprint_split.json | 13 ++++++ 5 files changed, 61 insertions(+), 11 deletions(-) create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_fingerprint_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_fingerprint_split.json diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index 8ca3ce7a..84de954b 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -699,7 +699,7 @@ def combined_training_data(self): combined_y = np.concatenate((train.y, valid.y), axis=0) combined_w = np.concatenate((train.w, valid.w), axis=0) combined_ids = 
np.concatenate((train.ids, valid.ids)) - self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids) + self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids) return self.combined_train_valid_data # **************************************************************************************** @@ -737,7 +737,6 @@ def get_subset_responses_and_weights(self, subset, transformers): if subset not in self.subset_response_dict: if subset in ('train', 'valid', 'train_valid'): for fold, (train, valid) in enumerate(self.train_valid_dsets): - print('(get_subset_responses_and_weights) for fold', fold) dataset = self.combined_training_data() elif subset == 'test': dataset = self.test_dset diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 7e0dfd86..2c1f9857 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -1121,7 +1121,6 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.subset = subset if self.subset in ('train', 'valid', 'train_valid'): for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): - print('iterating through fold:', fold) dataset = model_dataset.combined_training_data() elif self.subset == 'test': dataset = model_dataset.test_dset diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py index 0b102aeb..e87e1584 100644 --- a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py +++ b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py @@ -109,6 +109,16 @@ def test_scaffold_train_valid_test_split_reproducibility(): perform_splits_and_compare(pparams) +def test_fingerprint_train_valid_test_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 
'split_json/test_fingerprint_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + def test_kfold_random_split_reproducibility(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'split_json/test_kfold_random_split.json') @@ -129,19 +139,35 @@ def test_kfold_scaffold_split_reproducibility(): perform_splits_and_compare(pparams) +def test_kfold_fingerprint_split_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'split_json/test_kfold_fingerprint_split.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + perform_splits_and_compare(pparams) + #---------------------------------------------------------------------------------------------------------- if __name__ == "__main__": - print("test_random_train_valid_test_split_reproducibility") - test_random_train_valid_test_split_reproducibility() + #print("test_random_train_valid_test_split_reproducibility") + #test_random_train_valid_test_split_reproducibility() + + #print("test_scaffold_train_valid_test_split_reproducibility") + #test_scaffold_train_valid_test_split_reproducibility() + + print("test_fingerprint_train_valid_test_split_reproducibility") + test_fingerprint_train_valid_test_split_reproducibility() - print("test_scaffold_train_valid_test_split_reproducibility") - test_scaffold_train_valid_test_split_reproducibility() + #print("test_kfold_random_split_reproducibility") + #test_kfold_random_split_reproducibility() - print("test_kfold_random_split_reproducibility") - test_kfold_random_split_reproducibility() + 
#print("test_kfold_scaffold_split_reproducibility") + #test_kfold_scaffold_split_reproducibility() - print("test_kfold_scaffold_split_reproducibility") - test_kfold_scaffold_split_reproducibility() + print("test_kfold_fingerprint_split_reproducibility") + test_kfold_fingerprint_split_reproducibility() print("Passed!") \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_fingerprint_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_fingerprint_train_valid_test.json new file mode 100644 index 00000000..bafe7399 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_fingerprint_train_valid_test.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "train_valid_test", +"splitter": "fingerprint", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_fingerprint_split.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_fingerprint_split.json new file mode 100644 index 00000000..e03a34a0 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/split_json/test_kfold_fingerprint_split.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "k_fold_cv", +"splitter": "fingerprint", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file From 7b139675289c01452bdde69939d2ea4a1d8bae2c Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Wed, 31 Jul 2024 15:20:03 -0400 Subject: [PATCH 05/57] ruff linter suggestions --- 
.../integrative/sampling_test/sampling_test.py | 4 ---- .../integrative/seed_test/seed_test_splitting.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py index 41e2a7f7..9e867fb6 100644 --- a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py @@ -88,13 +88,9 @@ def saved_model_identity(pparams): model_tar = model_pipe.params.model_tarball_path pred_df = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, smiles_col=smiles_col, response_col=response_col) - pred_df2 = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, - smiles_col=smiles_col, response_col=response_col) X = pred_df[response_col+'_actual'].values y = pred_df[response_col+'_pred'].values - X2 = pred_df2[response_col+'_actual'].values - y2 = pred_df2[response_col+'_pred'].values accuracy = skmetrics.accuracy_score(X, y) precision = skmetrics.precision_score(X, y, average='weighted') diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py index e87e1584..a158e765 100644 --- a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py +++ b/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py @@ -152,20 +152,20 @@ def test_kfold_fingerprint_split_reproducibility(): #---------------------------------------------------------------------------------------------------------- if __name__ == "__main__": - #print("test_random_train_valid_test_split_reproducibility") - #test_random_train_valid_test_split_reproducibility() + print("test_random_train_valid_test_split_reproducibility") + test_random_train_valid_test_split_reproducibility() - #print("test_scaffold_train_valid_test_split_reproducibility") - #test_scaffold_train_valid_test_split_reproducibility() + 
print("test_scaffold_train_valid_test_split_reproducibility") + test_scaffold_train_valid_test_split_reproducibility() print("test_fingerprint_train_valid_test_split_reproducibility") test_fingerprint_train_valid_test_split_reproducibility() - #print("test_kfold_random_split_reproducibility") - #test_kfold_random_split_reproducibility() + print("test_kfold_random_split_reproducibility") + test_kfold_random_split_reproducibility() - #print("test_kfold_scaffold_split_reproducibility") - #test_kfold_scaffold_split_reproducibility() + print("test_kfold_scaffold_split_reproducibility") + test_kfold_scaffold_split_reproducibility() print("test_kfold_fingerprint_split_reproducibility") test_kfold_fingerprint_split_reproducibility() From 6fb1c62bcbcecc89c44f5aa64f55ebd45d380436 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Thu, 1 Aug 2024 14:42:23 -0400 Subject: [PATCH 06/57] updated kfoldregression --- atomsci/ddm/pipeline/perf_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 2c1f9857..63559acd 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -869,7 +869,8 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): """ self.subset = subset if self.subset in ('train', 'valid', 'train_valid'): - dataset = model_dataset.combined_training_data() + for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): + dataset = model_dataset.combined_training_data() elif self.subset == 'test': dataset = model_dataset.test_dset else: From fc24463bc7cde6fae0c81303205489c17122a30c Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 11:17:55 -0700 Subject: [PATCH 07/57] added imblearn to pip requirements --- pip/cpu_requirements.txt | 2 ++ pip/cuda_requirements.txt | 2 ++ pip/mchip_requirements.txt | 2 ++ pip/rocm_requirements.txt | 2 ++ 4 files changed, 8 insertions(+) diff --git 
a/pip/cpu_requirements.txt b/pip/cpu_requirements.txt index f3feb350..c5010ab9 100644 --- a/pip/cpu_requirements.txt +++ b/pip/cpu_requirements.txt @@ -29,6 +29,8 @@ pyarrow bravado +imblearn==0.12.3 + # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config # requires Cairo: sudo apt-get install libcairo2-dev diff --git a/pip/cuda_requirements.txt b/pip/cuda_requirements.txt index c223ad16..4241f052 100644 --- a/pip/cuda_requirements.txt +++ b/pip/cuda_requirements.txt @@ -31,6 +31,8 @@ pyarrow bravado +imblearn==0.12.3 + # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config # requires Cairo: sudo apt-get install libcairo2-dev diff --git a/pip/mchip_requirements.txt b/pip/mchip_requirements.txt index 6bb58ff1..47a55885 100644 --- a/pip/mchip_requirements.txt +++ b/pip/mchip_requirements.txt @@ -30,6 +30,8 @@ pyarrow bravado +imblearn==0.12.3 + # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config # requires Cairo: sudo apt-get install libcairo2-dev diff --git a/pip/rocm_requirements.txt b/pip/rocm_requirements.txt index 296f7b70..a9b3e85c 100644 --- a/pip/rocm_requirements.txt +++ b/pip/rocm_requirements.txt @@ -19,6 +19,8 @@ umap-learn pyarrow +imblearn==0.12.3 + # requires pkg-config to build: sudo apt-get pkg-config # requires Cairo: sudo apt-get install libcairo2-dev # pycairo From 561c3bb86f7b1259a6157ffec6a8e0aed8a4ba4e Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 11:19:45 -0700 Subject: [PATCH 08/57] unpin imblearn --- pip/cpu_requirements.txt | 2 +- pip/cuda_requirements.txt | 2 +- pip/mchip_requirements.txt | 2 +- pip/rocm_requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pip/cpu_requirements.txt b/pip/cpu_requirements.txt index c5010ab9..00afdfa3 100644 --- a/pip/cpu_requirements.txt +++ b/pip/cpu_requirements.txt @@ -29,7 +29,7 @@ pyarrow 
bravado -imblearn==0.12.3 +imblearn # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config diff --git a/pip/cuda_requirements.txt b/pip/cuda_requirements.txt index 4241f052..3b16ac75 100644 --- a/pip/cuda_requirements.txt +++ b/pip/cuda_requirements.txt @@ -31,7 +31,7 @@ pyarrow bravado -imblearn==0.12.3 +imblearn # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config diff --git a/pip/mchip_requirements.txt b/pip/mchip_requirements.txt index 47a55885..cf005cd4 100644 --- a/pip/mchip_requirements.txt +++ b/pip/mchip_requirements.txt @@ -30,7 +30,7 @@ pyarrow bravado -imblearn==0.12.3 +imblearn # optional for home users: prettier images in RDKit # requires pkg-config to build: sudo apt-get pkg-config diff --git a/pip/rocm_requirements.txt b/pip/rocm_requirements.txt index a9b3e85c..08601156 100644 --- a/pip/rocm_requirements.txt +++ b/pip/rocm_requirements.txt @@ -19,7 +19,7 @@ umap-learn pyarrow -imblearn==0.12.3 +imblearn # requires pkg-config to build: sudo apt-get pkg-config # requires Cairo: sudo apt-get install libcairo2-dev From 49dc67b2c5b7512d03f1e8760581cbf72accddb5 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 15:41:19 -0700 Subject: [PATCH 09/57] Clean up unused random_state or seed parameters or assignments. 
--- atomsci/ddm/pipeline/model_pipeline.py | 18 ++++++++++++------ atomsci/ddm/pipeline/model_wrapper.py | 20 +++----------------- atomsci/ddm/pipeline/splitting.py | 6 ------ 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 8e769e9b..775decf5 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -254,7 +254,7 @@ def __init__(self, params, ds_client=None, mlmt_client=None, random_state=None, # **************************************************************************************** - def load_featurize_data(self, params=None, random_state=None, seed=None): + def load_featurize_data(self, params=None): """Loads the dataset from the datastore or the file system and featurizes it. If we are training a new model, split the dataset into training, validation and test sets. @@ -594,7 +594,7 @@ def split_dataset(self, featurization=None): # **************************************************************************************** - def train_model(self, featurization=None, random_state=None, seed=None): + def train_model(self, featurization=None): """Build model described by self.params on the training dataset described by self.params. Generate predictions for the training, validation, and test datasets, and save the predictions and @@ -1108,7 +1108,9 @@ def run_models(params, shared_featurization=None, generator=False): # Create the ModelWrapper object. pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, - pipeline.ds_client) + pipeline.ds_client, + random_state=pipeline.random_state, + random_seed=pipeline.seed) # Get the tarball containing the saved model from the datastore, and extract it into model_dir. 
model_dataset_oid = metadata_dict['model_parameters']['model_dataset_oid'] @@ -1200,7 +1202,9 @@ def regenerate_results(result_dir, params=None, metadata_dict=None, shared_featu # Create the ModelWrapper object. pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, - pipeline.ds_client) + pipeline.ds_client, + random_state=pipeline.random_state, + seed=pipeline.seed) # Get the tarball containing the saved model from the datastore, and extract it into model_dir (old format) # or output_dir (new format) according to the format of the tarball contents. @@ -1312,7 +1316,7 @@ def create_prediction_pipeline(params, model_uuid, collection_name=None, featuri pipeline.orig_params = orig_params # Create the ModelWrapper object. - pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, + pipeline.model_wrapper = model_wrapper.(pipeline.params, featurization, pipeline.ds_client) orig_log_level = pipeline.log.getEffectiveLevel() @@ -1428,7 +1432,9 @@ def create_prediction_pipeline_from_file(params, reload_dir, model_path=None, mo pipeline.orig_params = orig_params # Create the ModelWrapper object. 
- pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization) + pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, + random_state=pipeline.random_state, + random_seed=pipeline.seed) orig_log_level = pipeline.log.getEffectiveLevel() if verbose: diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 11f0805d..b584d710 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -1202,9 +1202,6 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): """ super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) - self.random_state = random_state - self.seed = seed - if self.params.layer_sizes is None: if self.params.featurizer == 'ecfp': self.params.layer_sizes = [1000, 500] @@ -1617,9 +1614,6 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): self.model_dir = self.best_model_dir os.makedirs(self.best_model_dir, exist_ok=True) - self.random_state = random_state - self.seed = seed - self.model = self.make_dc_model(self.best_model_dir, random_state=random_state, seed=seed) # **************************************************************************************** @@ -1823,11 +1817,9 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): ds_client: datastore client. """ super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) - self.random_state = random_state - self.seed = seed # **************************************************************************************** - def make_dc_model(self, model_dir, random_state=None, seed=None): + def make_dc_model(self, model_dir): """Build a DeepChem model. 
Builds a model, wraps it in DeepChem's wrapper and returns it @@ -1978,11 +1970,9 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): ds_client: datastore client. """ super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) - self.random_state = random_state - self.seed = seed # **************************************************************************************** - def make_dc_model(self, model_dir, random_state=None, seed=None): + def make_dc_model(self, model_dir): """Build a DeepChem model. Builds a model, wraps it in DeepChem's wrapper and returns it @@ -2334,8 +2324,6 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): # use NNModelWrapper init. super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) self.num_epochs_trained = 0 - self.random_state = random_state - self.seed = seed self.model = self.recreate_model() # **************************************************************************************** @@ -2720,13 +2708,11 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): model: The dc.models.GraphConvModel, MultitaskRegressor, or MultitaskClassifier object, as specified by the params attribute """ - super().__init__(params, featurizer, ds_client) + super().__init__(params, featurizer, ds_client, random_state=random_state, seed=seed) # TODO (ksm): The next two attributes aren't used; suggest we drop them. 
self.g = tf.Graph() self.sess = tf.compat.v1.Session(graph=self.g) self.num_epochs_trained = 0 - self.random_state = random_state - self.seed = seed self.model = self.recreate_model(model_dir=self.model_dir) # **************************************************************************************** diff --git a/atomsci/ddm/pipeline/splitting.py b/atomsci/ddm/pipeline/splitting.py index be667046..2211274a 100644 --- a/atomsci/ddm/pipeline/splitting.py +++ b/atomsci/ddm/pipeline/splitting.py @@ -298,8 +298,6 @@ def __init__(self, params, random_state=None, seed=None): """ super().__init__(params, random_state, seed) self.num_folds = params.num_folds - self.random_state = random_state - self.seed = seed # **************************************************************************************** @@ -424,8 +422,6 @@ def __init__(self, params, random_state=None, seed=None): """ super().__init__(params, random_state=random_state, seed=seed) self.num_folds = 1 - self.random_state = random_state - self.seed = seed # **************************************************************************************** def get_split_prefix(self, parent=''): @@ -571,8 +567,6 @@ def __init__(self, params, random_state=None, seed=None): """This Splitting only does one thing and ignores all splitter parameters""" self.splitter = ProductionSplitter() self.split = 'production' - self.random_state = random_state - self.seed = seed # **************************************************************************************** def get_split_prefix(self, parent=''): From b41b7d57d227fbe3860ad7db0c4ffebbecbcad4b Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 16:03:46 -0700 Subject: [PATCH 10/57] fixed merging error --- atomsci/ddm/pipeline/model_datasets.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index f071ef23..aa656cbc 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ 
b/atomsci/ddm/pipeline/model_datasets.py @@ -660,7 +660,7 @@ def combined_training_data(self): combined_y = np.concatenate((train.y, valid.y), axis=0) combined_w = np.concatenate((train.w, valid.w), axis=0) combined_ids = np.concatenate((train.ids, valid.ids)) - self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids) + self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids) return self.combined_train_valid_data # **************************************************************************************** @@ -697,8 +697,7 @@ def get_subset_responses_and_weights(self, subset, transformers): """ if subset not in self.subset_response_dict: if subset in ('train', 'valid', 'train_valid'): - for fold, (train, valid) in enumerate(self.train_valid_dsets): - dataset = self.combined_training_data() + dataset = self.combined_training_data() elif subset == 'test': dataset = self.test_dset else: From b65ba09ef705fc93c9624e861aa2322a05ad017e Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 16:24:24 -0700 Subject: [PATCH 11/57] Fixed find and replace bug --- atomsci/ddm/pipeline/model_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 775decf5..1f357b04 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -1316,7 +1316,7 @@ def create_prediction_pipeline(params, model_uuid, collection_name=None, featuri pipeline.orig_params = orig_params # Create the ModelWrapper object. 
- pipeline.model_wrapper = model_wrapper.(pipeline.params, featurization, + pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, pipeline.ds_client) orig_log_level = pipeline.log.getEffectiveLevel() From 84babd21fd4e1557d69c31342591816f59fa6e8c Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 11 Sep 2024 16:31:45 -0700 Subject: [PATCH 12/57] make_dc_model does not need random_state or seed arguments --- atomsci/ddm/pipeline/model_pipeline.py | 4 ++-- atomsci/ddm/pipeline/model_wrapper.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 1f357b04..1644abc4 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -1110,7 +1110,7 @@ def run_models(params, shared_featurization=None, generator=False): pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, pipeline.ds_client, random_state=pipeline.random_state, - random_seed=pipeline.seed) + seed=pipeline.seed) # Get the tarball containing the saved model from the datastore, and extract it into model_dir. model_dataset_oid = metadata_dict['model_parameters']['model_dataset_oid'] @@ -1434,7 +1434,7 @@ def create_prediction_pipeline_from_file(params, reload_dir, model_path=None, mo # Create the ModelWrapper object. 
pipeline.model_wrapper = model_wrapper.create_model_wrapper(pipeline.params, featurization, random_state=pipeline.random_state, - random_seed=pipeline.seed) + seed=pipeline.seed) orig_log_level = pipeline.log.getEffectiveLevel() if verbose: diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index b584d710..ba74966d 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -1614,7 +1614,7 @@ def __init__(self, params, featurizer, ds_client, random_state=None, seed=None): self.model_dir = self.best_model_dir os.makedirs(self.best_model_dir, exist_ok=True) - self.model = self.make_dc_model(self.best_model_dir, random_state=random_state, seed=seed) + self.model = self.make_dc_model(self.best_model_dir) # **************************************************************************************** def train(self, pipeline): @@ -1683,7 +1683,7 @@ def train(self, pipeline): self.best_epoch = 0 # **************************************************************************************** - def make_dc_model(self, model_dir, random_state=None, seed=None): + def make_dc_model(self, model_dir): """Build a DeepChem model. Builds a model, wraps it in DeepChem's wrapper and returns it From a821f6aebac6a5ebd2fc2dee02c053af6ed399e9 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 12 Sep 2024 08:17:57 -0700 Subject: [PATCH 13/57] Changed constructor of ProductionSplitter to call Splitting's init function. 
--- atomsci/ddm/pipeline/splitting.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/splitting.py b/atomsci/ddm/pipeline/splitting.py index 2211274a..1b8ca52a 100644 --- a/atomsci/ddm/pipeline/splitting.py +++ b/atomsci/ddm/pipeline/splitting.py @@ -198,7 +198,9 @@ def __init__(self, params, random_state=None, seed=None): self.params = params self.split = params.splitter - if params.splitter == 'index': + if params.production: + self.splitter = ProductionSplitter() + elif params.splitter == 'index': self.splitter = dc.splits.IndexSplitter() elif params.splitter == 'random': self.splitter = dc.splits.RandomSplitter() @@ -565,8 +567,9 @@ def split( class ProductionSplitting(Splitting): def __init__(self, params, random_state=None, seed=None): """This Splitting only does one thing and ignores all splitter parameters""" - self.splitter = ProductionSplitter() + super().__init__(params, random_state=random_state, seed=seed) self.split = 'production' + self.num_folds = 1 # **************************************************************************************** def get_split_prefix(self, parent=''): From 31f3d5f451341794a30889d56194deba4c828d45 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Thu, 12 Sep 2024 09:13:59 -0700 Subject: [PATCH 14/57] removed heads --- atomsci/ddm/pipeline/perf_data.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 7f876f1c..996c370c 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -1113,17 +1113,9 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.subset = subset if self.subset in ('train', 'valid', 'train_valid'): -<<<<<<< HEAD -<<<<<<< HEAD for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): dataset = model_dataset.combined_training_data() -======= - dataset = 
model_dataset.combined_training_data() ->>>>>>> upstream/1.7.0 -======= - for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): - dataset = model_dataset.combined_training_data() ->>>>>>> origin/1.7.0 + elif self.subset == 'test': dataset = model_dataset.test_dset else: @@ -1249,10 +1241,7 @@ def get_pred_values(self): class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers) for id in ids], axis=0) prob_stds = None pred_classes = np.argmax(class_probs, axis=2) -<<<<<<< HEAD -======= ->>>>>>> origin/1.7.0 return (ids, pred_classes, class_probs, prob_stds) From d074f6502c6e90508ef08cbfe1e5dc1e5592bb25 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Thu, 12 Sep 2024 09:20:55 -0700 Subject: [PATCH 15/57] removed unused library --- atomsci/ddm/pipeline/model_datasets.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index 18264fef..aa656cbc 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -15,8 +15,6 @@ import getpass import sys -from collections import defaultdict - feather_supported = True try: import pyarrow.feather as feather From 2992bdff0631cf5e62c498e6c67b2370a56bfd4a Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 12 Sep 2024 11:56:50 -0700 Subject: [PATCH 16/57] Added more models for seeding test. 
--- ...tentivefp_regression_train_valid_test.json | 19 ++++++ ...hconv_classification_train_valid_test.json | 15 +++++ ...graphconv_regression_train_valid_test.json | 15 +++++ ...torchmpnn_regression_train_valid_test.json | 19 ++++++ .../integrative/seed_test/seed_test_models.py | 60 ++++++++++++++++++- 5 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/attentivefp_regression_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/graphconv_regression_train_valid_test.json create mode 100644 atomsci/ddm/test/integrative/seed_test/model_json/pytorchmpnn_regression_train_valid_test.json diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/attentivefp_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/attentivefp_regression_train_valid_test.json new file mode 100644 index 00000000..c87a245b --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/attentivefp_regression_train_valid_test.json @@ -0,0 +1,19 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "AttentiveFPModel", +"AttentiveFPModel_num_layers":"3", +"AttentiveFPModel_learning_rate": "0.0007", +"AttentiveFPModel_n_tasks": "1", +"featurizer":"MolGraphConvFeaturizer", +"MolGraphConvFeaturizer_use_edges":"True", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"15"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json 
b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json new file mode 100644 index 00000000..10b38ca1 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "graphconv", +"prediction_type": "classification", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"15"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_regression_train_valid_test.json new file mode 100644 index 00000000..3fd351fd --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_regression_train_valid_test.json @@ -0,0 +1,15 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": "NN", +"featurizer": "graphconv", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"15"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/pytorchmpnn_regression_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/pytorchmpnn_regression_train_valid_test.json new file mode 100644 index 00000000..20b2e0b2 --- /dev/null +++ b/atomsci/ddm/test/integrative/seed_test/model_json/pytorchmpnn_regression_train_valid_test.json @@ -0,0 +1,19 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"model_type": 
"PytorchMPNNModel", +"PytrochMPNNModel_mode": "regression", +"PytorchMPNNModel_learning_rate": "0.001", +"PytorchMPNNModel_n_tasks": "1", +"featurizer":"MolGraphConvFeaturizer", +"MolGraphConvFeaturizer_use_edges":"True", +"prediction_type": "regression", +"split_strategy": "train_valid_test", +"splitter": "random", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"pIC50", +"max_epochs":"15"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_models.py b/atomsci/ddm/test/integrative/seed_test/seed_test_models.py index d99d7eaf..450de758 100644 --- a/atomsci/ddm/test/integrative/seed_test/seed_test_models.py +++ b/atomsci/ddm/test/integrative/seed_test/seed_test_models.py @@ -239,7 +239,7 @@ def test_xgboost_classification_reproducibility(): def test_xgboost_regression_kfold_cv_reproducibility(): script_path = os.path.dirname(os.path.realpath(__file__)) - json_file = os.path.join(script_path, 'model_json/xgboost_regression_train_valid_test.json') + json_file = os.path.join(script_path, 'model_json/xgboost_regression_kfold_test.json') pparams = parse.wrapper(['--config_file', json_file]) pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') @@ -249,7 +249,39 @@ def test_xgboost_regression_kfold_cv_reproducibility(): def test_xgboost_classification_kfold_cv_reproducibility(): script_path = os.path.dirname(os.path.realpath(__file__)) - json_file = os.path.join(script_path, 'model_json/xgboost_classification_train_valid_test.json') + json_file = os.path.join(script_path, 'model_json/xgboost_classification_kfold_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +# graphconv +def 
test_graphconv_classification_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/graphconv_classification_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +def test_graphconv_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/graphconv_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + +# DCmodels +def test_attentivefp_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/attentivefp_regression_train_valid_test.json') pparams = parse.wrapper(['--config_file', json_file]) pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') @@ -257,6 +289,17 @@ def test_xgboost_classification_kfold_cv_reproducibility(): saved_model_identity(pparams) +def test_pytorchmpnn_regression_reproducibility(): + script_path = os.path.dirname(os.path.realpath(__file__)) + json_file = os.path.join(script_path, 'model_json/pytorchmpnn_regression_train_valid_test.json') + + pparams = parse.wrapper(['--config_file', json_file]) + pparams.dataset_key = os.path.join(script_path, '../../test_datasets/aurka_chembl_base_smiles_union.csv') + pparams.result_dir=script_path + + saved_model_identity(pparams) + + if __name__ == "__main__": # ------ random forest print("test_RF_regression_reproducibility") @@ -288,5 +331,18 @@ def 
test_xgboost_classification_kfold_cv_reproducibility(): print("test_xgboost_classification_kfold_reproducibility") test_xgboost_classification_kfold_cv_reproducibility() + # ------ graphconv + print("test_graphconv_classification_reproducibility") + test_graphconv_classification_reproducibility() + print("test_graphconv_regression_reproducibility") + test_graphconv_regression_reproducibility() + + # ------ dcmodels + print("test_attentivefp_regression_reproducibility") + test_attentivefp_regression_reproducibility() + + print("test_pytorchmpnn_regression_reproducibility") + test_pytorchmpnn_regression_reproducibility() + print("Passed!") \ No newline at end of file From ccebaed832c73d806b1fe7b42773ff150ed186e4 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 12 Sep 2024 15:39:23 -0700 Subject: [PATCH 17/57] Fixed seed for GCNModel. Should pass regularly now. --- .../dc_models/reg_config_H1_fit_GCNModel.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GCNModel.json b/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GCNModel.json index df299aee..f173fb3b 100644 --- a/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GCNModel.json +++ b/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GCNModel.json @@ -7,6 +7,8 @@ "data_owner": "username", "parser_version":"1.0", + "seed":"2015821819", + "comment": "Input file", "comment": "----------------------------------------", "comment": "Note: dataset_key must be a path/file name: E.G. 
./dataset.csv", @@ -44,5 +46,11 @@ "comment": "Results", "comment": "----------------------------------------", - "result_dir": "result" + "result_dir": "result", + + "comment": "Test", + "comment": "----------------------------------------", + "comment": "with the seed, the result should be 0.9871578924396036.", + "perf_threshold": "0.98" + } From dcc4809c3805ca25f4854b46cbe75145eba34b05 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 12 Sep 2024 16:19:21 -0700 Subject: [PATCH 18/57] Set seed to guarantee results in class_config_delaney_fit_nn_ecfp.json --- .../delaney_Panel/jsons/class_config_delaney_fit_NN_ecfp.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/delaney_Panel/jsons/class_config_delaney_fit_NN_ecfp.json b/atomsci/ddm/test/integrative/delaney_Panel/jsons/class_config_delaney_fit_NN_ecfp.json index 94683dcb..39a7cf99 100644 --- a/atomsci/ddm/test/integrative/delaney_Panel/jsons/class_config_delaney_fit_NN_ecfp.json +++ b/atomsci/ddm/test/integrative/delaney_Panel/jsons/class_config_delaney_fit_NN_ecfp.json @@ -37,5 +37,7 @@ "comment": "Results", "comment": "----------------------------------------", - "result_dir": "result" + "result_dir": "result", + + "seed":"3173915729" } From 922bf0c5035aac97ae3dad1ac10c018f2fef3b3a Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 18 Sep 2024 16:07:47 -0700 Subject: [PATCH 19/57] Moved 'test' from suffix to prefix --- .../sampling_test/sampling_test.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py index 9e867fb6..59a66d36 100644 --- a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py +++ b/atomsci/ddm/test/integrative/sampling_test/sampling_test.py @@ -119,7 +119,7 @@ def saved_model_identity(pparams): #-------------------------------------------------------------------
#-------- random forest -def train_valid_test_RF_SMOTE_test(): +def test_train_valid_test_RF_SMOTE(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_RF_SMOTE.json') @@ -131,7 +131,7 @@ def train_valid_test_RF_SMOTE_test(): saved_model_identity(pparams) -def k_fold_cv_RF_SMOTE_test(): +def test_k_fold_cv_RF_SMOTE(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_RF_SMOTE.json') @@ -143,7 +143,7 @@ def k_fold_cv_RF_SMOTE_test(): saved_model_identity(pparams) -def k_fold_cv_RF_undersampling_test(): +def test_k_fold_cv_RF_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_RF_undersampling.json') @@ -156,7 +156,7 @@ def k_fold_cv_RF_undersampling_test(): saved_model_identity(pparams) -def train_valid_test_RF_undersampling_test(): +def test_train_valid_test_RF_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_RF_undersampling.json') @@ -170,7 +170,7 @@ def train_valid_test_RF_undersampling_test(): #-------- neural network -def train_valid_test_NN_SMOTE_test(): +def test_train_valid_test_NN_SMOTE(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_NN_SMOTE.json') @@ -182,7 +182,7 @@ def train_valid_test_NN_SMOTE_test(): saved_model_identity(pparams) -def train_valid_test_NN_undersampling_test(): +def test_train_valid_test_NN_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_NN_undersampling.json') @@ -194,7 +194,7 @@ def train_valid_test_NN_undersampling_test(): saved_model_identity(pparams) -def k_fold_cv_NN_SMOTE_test(): +def test_k_fold_cv_NN_SMOTE(): script_path = 
os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_NN_SMOTE.json') @@ -206,7 +206,7 @@ def k_fold_cv_NN_SMOTE_test(): saved_model_identity(pparams) -def k_fold_cv_NN_undersampling_test(): +def test_k_fold_cv_NN_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_NN_undersampling.json') @@ -220,7 +220,7 @@ def k_fold_cv_NN_undersampling_test(): #-------- xgboost -def train_valid_test_xgboost_SMOTE_test(): +def test_train_valid_test_xgboost_SMOTE(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_xgboost_SMOTE.json') @@ -232,7 +232,7 @@ def train_valid_test_xgboost_SMOTE_test(): saved_model_identity(pparams) -def train_valid_test_xgboost_undersampling_test(): +def test_train_valid_test_xgboost_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/train_valid_test_xgboost_undersampling.json') @@ -244,7 +244,7 @@ def train_valid_test_xgboost_undersampling_test(): saved_model_identity(pparams) -def k_fold_cv_xgboost_SMOTE_test(): +def test_k_fold_cv_xgboost_SMOTE(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_xgboost_SMOTE.json') @@ -256,7 +256,7 @@ def k_fold_cv_xgboost_SMOTE_test(): saved_model_identity(pparams) -def k_fold_cv_xgboost_undersampling_test(): +def test_k_fold_cv_xgboost_undersampling(): script_path = os.path.dirname(os.path.realpath(__file__)) json_file = os.path.join(script_path, 'sampling_json/kfold_cv_xgboost_undersampling.json') @@ -271,39 +271,39 @@ def k_fold_cv_xgboost_undersampling_test(): if __name__=='__main__': print('train_valid_test_RF_SMOTE_test') - train_valid_test_RF_SMOTE_test() + test_train_valid_test_RF_SMOTE() print('train_valid_test_NN_SMOTE_test') - 
train_valid_test_NN_SMOTE_test() + test_train_valid_test_NN_SMOTE() print("train_valid_test_RF_undersampling_test") - train_valid_test_RF_undersampling_test() + test_train_valid_test_RF_undersampling() print("train_valid_test_NN_undersampling_test") - train_valid_test_NN_undersampling_test() + test_train_valid_test_NN_undersampling() print("kfold_cv_NN_SMOTE_test") - k_fold_cv_NN_SMOTE_test() + test_k_fold_cv_NN_SMOTE() print("kfold_cv_NN_undersampling_test") - k_fold_cv_NN_undersampling_test() + test_k_fold_cv_NN_undersampling() print("kfold_cv_RF_SMOTE_test") - k_fold_cv_RF_SMOTE_test() + test_k_fold_cv_RF_SMOTE() print("kfold_cv_RF_undersampling_test") - k_fold_cv_RF_undersampling_test() + test_k_fold_cv_RF_undersampling() print("train_valid_test_xgboost_SMOTE_test") - train_valid_test_xgboost_SMOTE_test() + test_train_valid_test_xgboost_SMOTE() print("train_valid_test_xgboost_undersampling_test") - train_valid_test_xgboost_undersampling_test() + test_train_valid_test_xgboost_undersampling() print("k_fold_cv_xgboost_SMOTE_test") - k_fold_cv_xgboost_SMOTE_test() + test_k_fold_cv_xgboost_SMOTE() print("k_fold_cv_xgboost_undersampling_test") - k_fold_cv_xgboost_undersampling_test() + test_k_fold_cv_xgboost_undersampling() print("Passed!") \ No newline at end of file From 82838d189c58c08bbb40bf255c0f67f319a39ab3 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 19 Sep 2024 11:30:00 -0700 Subject: [PATCH 20/57] Renamed these test files to start with test_ so they're caught by the test scripts --- .../sampling_test/{sampling_test.py => test_sampling.py} | 0 .../seed_test/{seed_test_models.py => test_seed_models.py} | 0 .../seed_test/{seed_test_splitting.py => test_seed_splitting.py} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename atomsci/ddm/test/integrative/sampling_test/{sampling_test.py => test_sampling.py} (100%) rename atomsci/ddm/test/integrative/seed_test/{seed_test_models.py => test_seed_models.py} (100%) rename 
atomsci/ddm/test/integrative/seed_test/{seed_test_splitting.py => test_seed_splitting.py} (100%) diff --git a/atomsci/ddm/test/integrative/sampling_test/sampling_test.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py similarity index 100% rename from atomsci/ddm/test/integrative/sampling_test/sampling_test.py rename to atomsci/ddm/test/integrative/sampling_test/test_sampling.py diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py similarity index 100% rename from atomsci/ddm/test/integrative/seed_test/seed_test_models.py rename to atomsci/ddm/test/integrative/seed_test/test_seed_models.py diff --git a/atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py b/atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py similarity index 100% rename from atomsci/ddm/test/integrative/seed_test/seed_test_splitting.py rename to atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py From 4e471cb25ccf0cf0e7ecfe315d75adf97bf2fc92 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 19 Sep 2024 14:34:40 -0700 Subject: [PATCH 21/57] Changed MultitaskScaffoldSplit and GeneticAlgorithm to use a Generator and updated the test_split to use a fixed seed --- atomsci/ddm/pipeline/GeneticAlgorithm.py | 20 +++++++++++----- .../ddm/pipeline/MultitaskScaffoldSplit.py | 24 ++++++++++++------- .../integrative/multitask_split/test_split.py | 4 ++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index 0f6eb11a..c9d86dbd 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -1,10 +1,11 @@ import numpy as np +import uuid import scipy.spatial.distance as scipy_distance import multiprocessing import random from tqdm import tqdm import timeit -from typing import Any, Callable, List, Tuple +from typing import Any, Callable, List, Tuple,
Optional N_PROCS = multiprocessing.cpu_count() @@ -22,7 +23,8 @@ def __init__(self, init_pop: List[List[Any]], fitness_func: Callable, crossover_func: Callable, - mutate_func: Callable): + mutate_func: Callable, + seed: Optional[int]): """ Creates a GeneticAlgorithm object @@ -40,8 +42,14 @@ def __init__(self, mutate_func: Callable A callable that takes a list of chromosomes and returns another list of mutated chromosomes + seed: Optional[int] + Seed for random number generator """ + if seed is None: + seed = uuid.uuid4().int % (2**32) + self.random_state = np.random.default_rng(seed) + self.pop = init_pop self.pop_scores = None self.num_pop = len(init_pop) @@ -177,13 +185,13 @@ def step(self, print_timings: bool = False): # select parents using rank selection i = timeit.default_timer() - new_pop = self.crossover_func(parents, self.num_pop) + new_pop = self.crossover_func(parents, self.num_pop, random_state=self.random_state) if print_timings: print('\tcrossover %0.2f min'%((timeit.default_timer()-i)/60)) # mutate population i = timeit.default_timer() - self.pop = self.mutate_func(new_pop) + self.pop = self.mutate_func(new_pop, random_state=self.random_state) if print_timings: print('\tmutate %0.2f min'%((timeit.default_timer()-i)/60)) print('total %0.2f min'%((timeit.default_timer()-start)/60)) @@ -199,7 +207,7 @@ def step(self, print_timings: bool = False): def fitness_func(chromosome): return 1 - scipy_distance.rogerstanimoto(chromosome, target_chromosome) - def crossover_func(parents, pop_size): + def crossover_func(parents, pop_size, random_state): new_pop = [] for i in range(num_pop): parent1 = parents[i%len(parents)] @@ -210,7 +218,7 @@ def crossover_func(parents, pop_size): return new_pop - def mutate_func(pop, mutate_chance=0.01): + def mutate_func(pop, random_state, mutate_chance=0.01): new_pop = [] for chromosome in pop: new_chromosome = list(chromosome) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py 
b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index e3432360..2da4b7f2 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -636,8 +636,8 @@ def split(self, A tuple with 3 elements that are training, validation, and test compound indices into dataset, respectively """ - if seed is not None: - np.random.seed(seed) + self.seed = seed + self.dataset = dataset self.diff_fitness_weight_tvt = diff_fitness_weight_tvt self.diff_fitness_weight_tvv = diff_fitness_weight_tvv @@ -674,7 +674,7 @@ def split(self, population.append(split_chromosome) gene_alg = ga.GeneticAlgorithm(population, self.grade, ga_crossover, - ga_mutate) + ga_mutate, self.seed) #gene_alg.iterate(num_generations) for i in range(self.num_generations): gene_alg.step(print_timings=print_timings) @@ -859,7 +859,8 @@ def train_valid_test_split(self, return train_dataset, valid_dataset, test_dataset def ga_crossover(parents: List[List[str]], - num_pop: int) -> List[List[str]]: + num_pop: int, + random_state: np.random.Generator) -> List[List[str]]: """Create the next generation from parents A random index is chosen and genes up to that index from @@ -872,6 +873,8 @@ def ga_crossover(parents: List[List[str]], A list of chromosomes. 
num_pop: int The number of new chromosomes to make + random_state: np.random.Generator + Random number generator Returns ------- List[List[str]] @@ -883,13 +886,14 @@ def ga_crossover(parents: List[List[str]], parent1 = parents[i%len(parents)] parent2 = parents[(i+1)%len(parents)] - crossover_point = random.randint(0, len(parents[0])-1) + crossover_point = random_state.integers(low=0, high=len(parents[0])-1, size=1)[0] new_pop.append(parent1[:crossover_point]+parent2[crossover_point:]) return new_pop def ga_mutate(new_pop: List[List[str]], - mutation_rate: float = .02) -> List[List[str]]: + random_state: np.random.Generator, + mutation_rate: float = .02,) -> List[List[str]]: """Mutate the population Each chromosome is copied and mutated at mutation_rate. @@ -900,6 +904,8 @@ def ga_mutate(new_pop: List[List[str]], ---------- new_pop: List[List[str]] A list of chromosomes. + random_state: np.random.Generator + Random number generator mutation_rate: float How often a mutation occurs. 0.02 is a good rate for my test sets. 
@@ -913,7 +919,7 @@ def ga_mutate(new_pop: List[List[str]], new_solution = list(solution) for i, gene in enumerate(new_solution): if random.random() < mutation_rate: - new_solution[i] = ['train', 'valid', 'test'][random.randint(0,2)] + new_solution[i] = ['train', 'valid', 'test'][random_state.integers(low=0, high=2, size=1)[0]] mutated.append(new_solution) return mutated @@ -1039,6 +1045,7 @@ def parse_args(): parser.add_argument('id_col', type=str, help='the column containing ids') parser.add_argument('response_cols', type=str, help='comma seperated string of response columns') parser.add_argument('output', type=str, help='name of the split file') + parser.add_argument('seed', type=int, default=0, help='name of the split file') return parser.parse_args() @@ -1054,5 +1061,6 @@ def parse_args(): mss = MultitaskScaffoldSplitter() mss_split_df = split_with(total_df, mss, smiles_col=args.smiles_col, id_col=args.id_col, response_cols=response_cols, - diff_fitness_weight=dfw, ratio_fitness_weight=rfw, num_generations=args.num_gens) + diff_fitness_weight=dfw, ratio_fitness_weight=rfw, num_generations=args.num_gens, + seed=args.seed) mss_split_df.to_csv(args.output, index=False) diff --git a/atomsci/ddm/test/integrative/multitask_split/test_split.py b/atomsci/ddm/test/integrative/multitask_split/test_split.py index d47ecc3b..dc2ddeac 100644 --- a/atomsci/ddm/test/integrative/multitask_split/test_split.py +++ b/atomsci/ddm/test/integrative/multitask_split/test_split.py @@ -69,7 +69,7 @@ def test_splits(): smiles_col=smiles_col, id_col=id_col, response_cols=response_cols, diff_fitness_weight_tvt=dfw, ratio_fitness_weight=rfw, num_generations=1, num_super_scaffolds=num_super_scaffolds, - frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid) + frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid, seed=0) mss_split_df.to_csv('one_gen_split.csv', index=False) assert len(total_df) == len(mss_split_df) @@ -86,7 +86,7 @@ def test_splits(): 
diff_fitness_weight_tvt=dfw, ratio_fitness_weight=rfw, num_generations=num_generations, num_super_scaffolds=num_super_scaffolds, - frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid) + frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid, seed=0) mss_split_df.to_csv('thirty_gen_split.csv', index=False) assert len(total_df) == len(mss_split_df) From baa54781d6bfbe5232e0d043356ebb8cc418452e Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 19 Sep 2024 16:24:21 -0700 Subject: [PATCH 22/57] Added test for MTSS seed and fixed a few cases where the wrong random method was used --- atomsci/ddm/pipeline/GeneticAlgorithm.py | 5 +- .../ddm/pipeline/MultitaskScaffoldSplit.py | 3 +- .../integrative/multitask_split/test_split.py | 51 ++++++++++++++++++- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/atomsci/ddm/pipeline/GeneticAlgorithm.py b/atomsci/ddm/pipeline/GeneticAlgorithm.py index c9d86dbd..53e3d967 100644 --- a/atomsci/ddm/pipeline/GeneticAlgorithm.py +++ b/atomsci/ddm/pipeline/GeneticAlgorithm.py @@ -2,7 +2,6 @@ import uuid import scipy.spatial.distance as scipy_distance import multiprocessing -import random from tqdm import tqdm import timeit from typing import Any, Callable, List, Tuple, Optional @@ -213,7 +212,7 @@ def crossover_func(parents, pop_size, random_state): parent1 = parents[i%len(parents)] parent2 = parents[(i+1)%len(parents)] - crossover_point = random.randint(0, len(parents[0])-1) + crossover_point = random_state.integers(0, len(parents[0])-1, 1)[0] new_pop.append(parent1[:crossover_point]+parent2[crossover_point:]) return new_pop @@ -223,7 +222,7 @@ def mutate_func(pop, random_state, mutate_chance=0.01): for chromosome in pop: new_chromosome = list(chromosome) for i, g in enumerate(new_chromosome): - if random.random() < mutate_chance: + if random_state.random() < mutate_chance: if new_chromosome[i] == 0: new_chromosome[i] = 1 else: diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py
b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 2da4b7f2..0437a407 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1,6 +1,5 @@ import argparse import logging -import random import timeit import tempfile from typing import List, Optional, Set, Tuple @@ -918,7 +917,7 @@ def ga_mutate(new_pop: List[List[str]], for solution in new_pop: new_solution = list(solution) for i, gene in enumerate(new_solution): - if random.random() < mutation_rate: + if random_state.random() < mutation_rate: new_solution[i] = ['train', 'valid', 'test'][random_state.integers(low=0, high=2, size=1)[0]] mutated.append(new_solution) diff --git a/atomsci/ddm/test/integrative/multitask_split/test_split.py b/atomsci/ddm/test/integrative/multitask_split/test_split.py index dc2ddeac..f89d6fdb 100644 --- a/atomsci/ddm/test/integrative/multitask_split/test_split.py +++ b/atomsci/ddm/test/integrative/multitask_split/test_split.py @@ -41,6 +41,54 @@ def clean(): delete_file('thirty_gen_split.csv') delete_file('ss_split.csv') +def test_seeded_splits(): + clean() + + init_data() + + smiles_col = 'compound_id' + id_col = 'compound_id' + frac_train = 0.8 + frac_test = 0.1 + frac_valid = 0.1 + num_super_scaffolds = 60 + dfw = 2 # chemical distance importance weight + rfw = 1 # split fraction importance weight + + total_df = pd.read_csv('KCNA5_KCNH2_SCN5A_data.csv', dtype={id_col:str}) + response_cols = ['target_KCNA5_standard_value', + 'target_KCNH2_standard_value', + 'target_SCN5A_activity'] + + # ------------------------------------------------------------------------- + # one generation multitask scaffold split + mss = MultitaskScaffoldSplitter() + A_split_df = split_with(total_df, mss, + smiles_col=smiles_col, id_col=id_col, response_cols=response_cols, + diff_fitness_weight_tvt=dfw, ratio_fitness_weight=rfw, num_generations=1, + num_super_scaffolds=num_super_scaffolds, + frac_train=frac_train, frac_test=frac_test, 
frac_valid=frac_valid, seed=0) + + b_mss = MultitaskScaffoldSplitter() + B_split_df = split_with(total_df, b_mss, + smiles_col=smiles_col, id_col=id_col, response_cols=response_cols, + diff_fitness_weight_tvt=dfw, ratio_fitness_weight=rfw, num_generations=1, + num_super_scaffolds=num_super_scaffolds, + frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid, seed=0) + + c_mss = MultitaskScaffoldSplitter() + C_split_df = split_with(total_df, c_mss, + smiles_col=smiles_col, id_col=id_col, response_cols=response_cols, + diff_fitness_weight_tvt=dfw, ratio_fitness_weight=rfw, num_generations=1, + num_super_scaffolds=num_super_scaffolds, + frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid, seed=42) + + assert all(A_split_df['cmpd_id']==B_split_df['cmpd_id']) and all(A_split_df['subset']==B_split_df['subset']) + # compounds can be in the same order + assert not all(A_split_df['subset']==C_split_df['subset']) + + clean() + def test_splits(): clean() @@ -166,6 +214,7 @@ def test_pipeline_split_and_train(): clean() if __name__ == '__main__': - test_splits() + test_seeded_splits() + #test_splits() #test_pipeline_split_only() #test_pipeline_split_and_train() \ No newline at end of file From 4eb4ee4fb55bb7b93d628bae65edb19d4972696a Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 19 Sep 2024 16:47:22 -0700 Subject: [PATCH 23/57] renamed this file to match what's in test_seed_splitting.py --- .../test_scaffold_train_valid_test_split.json | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test_split.json diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test_split.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test_split.json new file mode 100644 index 00000000..f913cc9d --- /dev/null +++
b/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test_split.json @@ -0,0 +1,13 @@ +{"verbose": "True", +"datastore": "False", +"save_results": "False", +"featurizer": "ecfp", +"split_strategy": "train_valid_test", +"splitter": "scaffold", +"split_test_frac": "0.15", +"split_valid_frac": "0.15", +"transformers": "True", +"id_col": "compound_id", +"smiles_col": "base_rdkit_smiles", +"response_cols":"active", +"max_epochs":"100"} \ No newline at end of file From 4588a9dfc33fe73c9dc6134e5d93e405cb13af88 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 19 Sep 2024 16:49:59 -0700 Subject: [PATCH 24/57] renamed this to match the test --- .../split_json/test_scaffold_train_valid_test.json | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json diff --git a/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json deleted file mode 100644 index f913cc9d..00000000 --- a/atomsci/ddm/test/integrative/seed_test/split_json/test_scaffold_train_valid_test.json +++ /dev/null @@ -1,13 +0,0 @@ -{"verbose": "True", -"datastore": "False", -"save_results": "False", -"featurizer": "ecfp", -"split_strategy": "train_valid_test", -"splitter": "scaffold", -"split_test_frac": "0.15", -"split_valid_frac": "0.15", -"transformers": "True", -"id_col": "compound_id", -"smiles_col": "base_rdkit_smiles", -"response_cols":"active", -"max_epochs":"100"} \ No newline at end of file From ff58d023a804db0ece269573ffdca6b07ad1eace Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 24 Sep 2024 08:37:57 -0700 Subject: [PATCH 25/57] Removed try except blocks in test code. 
We need to see these errors --- .../sampling_test/test_sampling.py | 50 +++++++++---------- .../integrative/seed_test/test_seed_models.py | 18 ++----- 2 files changed, 28 insertions(+), 40 deletions(-) diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py index 59a66d36..1e1f1aa4 100644 --- a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py @@ -64,11 +64,7 @@ def saved_model_identity(pparams): pparams.previously_split = True pparams.split_only=False - try: - model_pipe.train_model() - except Exception as e: - print(f"Error during model training: {e}") - return + model_pipe.train_model() #train_pipe = train(pparams) split_csv = os.path.join(script_path, '../../test_datasets/', model_pipe.data._get_split_key()) @@ -270,40 +266,40 @@ def test_k_fold_cv_xgboost_undersampling(): #------------------------------------------------------------------- if __name__=='__main__': - print('train_valid_test_RF_SMOTE_test') - test_train_valid_test_RF_SMOTE() + #print('train_valid_test_RF_SMOTE_test') + #test_train_valid_test_RF_SMOTE() - print('train_valid_test_NN_SMOTE_test') - test_train_valid_test_NN_SMOTE() + #print('train_valid_test_NN_SMOTE_test') + #test_train_valid_test_NN_SMOTE() - print("train_valid_test_RF_undersampling_test") - test_train_valid_test_RF_undersampling() + #print("train_valid_test_RF_undersampling_test") + #test_train_valid_test_RF_undersampling() - print("train_valid_test_NN_undersampling_test") - test_train_valid_test_NN_undersampling() + #print("train_valid_test_NN_undersampling_test") + #test_train_valid_test_NN_undersampling() print("kfold_cv_NN_SMOTE_test") test_k_fold_cv_NN_SMOTE() - print("kfold_cv_NN_undersampling_test") - test_k_fold_cv_NN_undersampling() + #print("kfold_cv_NN_undersampling_test") + #test_k_fold_cv_NN_undersampling() - print("kfold_cv_RF_SMOTE_test") - 
test_k_fold_cv_RF_SMOTE() + #print("kfold_cv_RF_SMOTE_test") + #test_k_fold_cv_RF_SMOTE() - print("kfold_cv_RF_undersampling_test") - test_k_fold_cv_RF_undersampling() + #print("kfold_cv_RF_undersampling_test") + #test_k_fold_cv_RF_undersampling() - print("train_valid_test_xgboost_SMOTE_test") - test_train_valid_test_xgboost_SMOTE() + #print("train_valid_test_xgboost_SMOTE_test") + #test_train_valid_test_xgboost_SMOTE() - print("train_valid_test_xgboost_undersampling_test") - test_train_valid_test_xgboost_undersampling() + #print("train_valid_test_xgboost_undersampling_test") + #test_train_valid_test_xgboost_undersampling() - print("k_fold_cv_xgboost_SMOTE_test") - test_k_fold_cv_xgboost_SMOTE() + #print("k_fold_cv_xgboost_SMOTE_test") + #test_k_fold_cv_xgboost_SMOTE() - print("k_fold_cv_xgboost_undersampling_test") - test_k_fold_cv_xgboost_undersampling() + #print("k_fold_cv_xgboost_undersampling_test") + #test_k_fold_cv_xgboost_undersampling() print("Passed!") \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py index 450de758..7231ec28 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py @@ -58,21 +58,13 @@ def saved_model_identity(pparams): pparams.split_uuid = split_uuid pparams.previously_split = True pparams.split_only = False - try: - model_pipe.train_model() - except Exception as e: - print(f"Error during model training: {e}") - return + + model_pipe.train_model() # load model metrics from file - - try: - with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: - model_metrics = json.load(f) - except Exception as e: - print(f"Error during loading model metrics: {e}") - return - + with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: + model_metrics = json.load(f) + original_metrics = find_best_test_metric(model_metrics) if 
pparams.prediction_type == 'regression': original_mae = original_metrics['prediction_results']['mae_score'] From 0028ed7a66982ef46fe3a6b203a868d0e6d011cd Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 24 Sep 2024 13:49:31 -0700 Subject: [PATCH 26/57] Added seed to this test so that it passes more consistently --- .../ddm/test/integrative/delaney_RF/config_delaney_fit_RF.json | 1 + 1 file changed, 1 insertion(+) diff --git a/atomsci/ddm/test/integrative/delaney_RF/config_delaney_fit_RF.json b/atomsci/ddm/test/integrative/delaney_RF/config_delaney_fit_RF.json index 0aaf9a37..0941eb2e 100644 --- a/atomsci/ddm/test/integrative/delaney_RF/config_delaney_fit_RF.json +++ b/atomsci/ddm/test/integrative/delaney_RF/config_delaney_fit_RF.json @@ -25,6 +25,7 @@ "comment": "Model", "comment": "----------------------------------------", "model_type": "RF", + "seed": "0", "comment": "Results", "comment": "----------------------------------------", From 0c83b6b6b5a94cc6fce944ea2a099775bbbcff09 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 24 Sep 2024 15:36:02 -0700 Subject: [PATCH 27/57] combined_training_data now accounts for synthetic datasets --- atomsci/ddm/pipeline/model_datasets.py | 20 ++++++++++++++++++++ atomsci/ddm/pipeline/perf_data.py | 10 +++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index aa656cbc..f7d0cb75 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -655,11 +655,31 @@ def combined_training_data(self): # All of the splits have the same combined train/valid data, regardless of whether we're using # k-fold or train/valid/test splitting. if self.combined_train_valid_data is None: + # normally combining one fold is sufficient, but if SMOTE is being used + # each fold will have compounds unique to it. 
(train, valid) = self.train_valid_dsets[0] combined_X = np.concatenate((train.X, valid.X), axis=0) combined_y = np.concatenate((train.y, valid.y), axis=0) combined_w = np.concatenate((train.w, valid.w), axis=0) combined_ids = np.concatenate((train.ids, valid.ids)) + + contains_synthetic = any(id.startswith('synthetic_') for id in train.ids) + if contains_synthetic: + # for each successive fold, merge in any new compounds + for train, valid in self.train_valid_dsets[1:]: + fold_ids = np.concatenate((train.ids, valid.ids)) + new_id_indexes = [i for i in range(len(fold_ids)) if i not in combined_ids] + + fold_ids = fold_ids[new_id_indexes] + fold_X = np.concatenate((train.X, valid.X), axis=0)[new_id_indexes] + fold_y = np.concatenate((train.y, valid.y), axis=0)[new_id_indexes] + fold_w = np.concatenate((train.w, valid.w), axis=0)[new_id_indexes] + + combined_X = np.concatenate((combined_X, fold_X), axis=0) + combined_y = np.concatenate((combined_y, fold_y), axis=0) + combined_w = np.concatenate((combined_w, fold_w), axis=0) + combined_ids = np.concatenate((combined_ids, fold_ids)) + self.combined_train_valid_data = NumpyDataset(combined_X, combined_y, w=combined_w, ids=combined_ids) return self.combined_train_valid_data diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 996c370c..dfef580e 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -861,8 +861,7 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): """ self.subset = subset if self.subset in ('train', 'valid', 'train_valid'): - for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): - dataset = model_dataset.combined_training_data() + dataset = model_dataset.combined_training_data() elif self.subset == 'test': dataset = model_dataset.test_dset else: @@ -1113,9 +1112,7 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.subset = subset if self.subset in 
('train', 'valid', 'train_valid'): - for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): - dataset = model_dataset.combined_training_data() - + dataset = model_dataset.combined_training_data() elif self.subset == 'test': dataset = model_dataset.test_dset else: @@ -1132,6 +1129,8 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.num_cmpds = dataset.y.shape[0] self.num_tasks = dataset.y.shape[1] self.num_classes = len(set(model_dataset.dataset.y.flatten())) + # pred vals maps compound ids to a matrix of predictions. + # predictions will be concatenated one by one as they come in in accumulate_preds self.pred_vals = dict([(id, np.empty((0, self.num_tasks, self.num_classes), dtype=np.float32)) for id in dataset.ids]) real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, []) @@ -1179,6 +1178,7 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): """ class_probs = self._reshape_preds(predicted_vals) for i, id in enumerate(ids): + # Record predictions for each compound. self.pred_vals[id] = np.concatenate([self.pred_vals[id], class_probs[i,:,:].reshape((1,self.num_tasks,-1))], axis=0) self.folds += 1 real_vals = self.get_real_values(ids) From 0a616b2fdc4d3a60b071dd802e2a26b6c89f3e47 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 24 Sep 2024 16:56:21 -0700 Subject: [PATCH 28/57] set uncertainty false for classification test since it is unsupported. Set response column to 'active' since that's the classification column.
Added warning for when the expected number of classes doesn't match the number of classes found --- atomsci/ddm/pipeline/model_datasets.py | 6 ++++++ .../graphconv_classification_train_valid_test.json | 1 + .../model_json/rf_classification_train_valid_test.json | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index f7d0cb75..65a511be 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -479,6 +479,12 @@ def _check_classes(self): (Boolean): boolean specifying if all classes are specified in all splits """ ref_class_set = get_classes(self.train_valid_dsets[0][0].y) + num_classes = len(ref_class_set) + if num_classes != self.params.class_number: + logger = logging.getLogger('ATOM') + logger.warning(f"Expected class_number:{self.params.class_number} " + f"classes but got {num_classes} instead. Double check " + "response columns or class_number parameter.") for train, valid in self.train_valid_dsets: if not ref_class_set == get_classes(train.y): return False diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json index 10b38ca1..7f0380e2 100644 --- a/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json +++ b/atomsci/ddm/test/integrative/seed_test/model_json/graphconv_classification_train_valid_test.json @@ -12,4 +12,5 @@ "id_col": "compound_id", "smiles_col": "base_rdkit_smiles", "response_cols":"active", +"uncertainty":"False", "max_epochs":"15"} \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json index 4263b742..4777506f 100644 ---
a/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json +++ b/atomsci/ddm/test/integrative/seed_test/model_json/rf_classification_train_valid_test.json @@ -11,5 +11,5 @@ "transformers": "True", "id_col": "compound_id", "smiles_col": "base_rdkit_smiles", -"response_cols":"pIC50", +"response_cols":"active", "max_epochs":"100"} \ No newline at end of file From c3b1922fd3101383af275fb3a12607f2a82f8c4a Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Tue, 24 Sep 2024 22:22:02 -0400 Subject: [PATCH 29/57] updated tests --- .../sampling_test/test_sampling.py | 110 ++++++++++++------ .../integrative/seed_test/test_seed_models.py | 2 +- 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py index 1e1f1aa4..488fcb6d 100644 --- a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py @@ -31,30 +31,20 @@ def get_test_set(dataset_key, split_csv, id_col): return test_df -def split(pparams): - split_params=copy.copy(pparams) - split_params.split_only=True - split_params.previously_split=False - - model_pipeline= mp.ModelPipeline(split_params) - split_uuid = model_pipeline.split_dataset() - - return split_uuid - -def train(pparams): - train_pipe = mp.ModelPipeline(pparams) - train_pipe.train_model() - - return train_pipe - def find_best_test_metric(model_metrics): for metric in model_metrics: if metric['label'] == 'best' and metric['subset']=='test': return metric return None +def extract_seed(metadata_path): + with open(metadata_path, 'r') as f: + metadata = json.load(f) + return metadata.get('seed') + def saved_model_identity(pparams): script_path = os.path.dirname(os.path.realpath(__file__)) + retrain_pparams = copy.copy(pparams) model_pipe = mp.ModelPipeline(pparams) @@ -66,7 +56,6 @@ def saved_model_identity(pparams): 
model_pipe.train_model() - #train_pipe = train(pparams) split_csv = os.path.join(script_path, '../../test_datasets/', model_pipe.data._get_split_key()) test_df = get_test_set(pparams.dataset_key, split_csv, pparams.id_col) @@ -75,6 +64,11 @@ def saved_model_identity(pparams): model_metrics = json.load(f) metrics = find_best_test_metric(model_metrics) + original_accuracy = metrics['prediction_results']['accuracy_score'] + original_precision = metrics['prediction_results']['precision'] + original_recall = metrics['prediction_results']['recall_score'] + original_prc_auc = metrics['prediction_results']['prc_auc_score'] + id_col = metrics['input_dataset']['id_col'] response_col=metrics['input_dataset']['response_cols'][0] smiles_col = metrics['input_dataset']['smiles_col'] @@ -84,7 +78,9 @@ def saved_model_identity(pparams): model_tar = model_pipe.params.model_tarball_path pred_df = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, smiles_col=smiles_col, response_col=response_col) - + # generate another prediction from the same model file + pred_df2 = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col, smiles_col=smiles_col, response_col=response_col) + X = pred_df[response_col+'_actual'].values y = pred_df[response_col+'_pred'].values @@ -93,25 +89,67 @@ def saved_model_identity(pparams): recall = skmetrics.recall_score(X, y, average='weighted') prc_auc = skmetrics.average_precision_score(X, y) - saved_accuracy = metrics['prediction_results']['accuracy_score'] - saved_precision = metrics['prediction_results']['precision'] - saved_recall = metrics['prediction_results']['recall_score'] - saved_prc_auc = metrics['prediction_results']['prc_auc_score'] + # return the metrics from the second prediction + X2 = pred_df2[response_col+'_actual'].values + y2 = pred_df2[response_col+'_pred'].values - # show results + x2_accuracy = skmetrics.accuracy_score(X2, y2) + x2_precision = skmetrics.precision_score(X2, y2, average='weighted') + x2_recall = 
skmetrics.recall_score(X2, y2, average='weighted') + x2_prc_auc = skmetrics.average_precision_score(X2, y2) + + #saved_accuracy = metrics['prediction_results']['accuracy_score'] + #saved_precision = metrics['prediction_results']['precision'] + #saved_recall = metrics['prediction_results']['recall_score'] + #saved_prc_auc = metrics['prediction_results']['prc_auc_score'] + + # show results and compare the two predictions print(metrics['subset']) print(pred_df.columns) - print("Accuracy difference:", abs(accuracy - saved_accuracy)) - print("Precision difference:", abs(precision - saved_precision)) - print("Recall difference:", abs(recall-saved_recall)) - print("PRC AUC difference:", abs(prc_auc-saved_prc_auc)) - - assert abs(accuracy-saved_accuracy) < 1 \ - and abs(precision - saved_precision) < 1 \ - and abs(recall-saved_recall) < 1 \ - and abs(prc_auc - saved_prc_auc) < 1 \ + print("Prediction results") + print("Accuracy difference:", abs(accuracy - x2_accuracy)) + print("Precision difference:", abs(precision - x2_precision)) + print("Recall difference:", abs(recall-x2_recall)) + print("PRC AUC difference:", abs(prc_auc-x2_prc_auc)) + + assert abs(accuracy - x2_accuracy) < 1e-9 \ + and abs(precision - x2_precision) < 1e-9 \ + and abs(recall - x2_recall) < 1e-9 \ + and abs(prc_auc - x2_prc_auc) < 1e-9 \ and (test_length == len(test_df)) + # create another test to ensure that the sampling methods are reproducible with the seed + metadata_path = os.path.join(pparams.output_dir, 'model_metadata.json') + seed = extract_seed(metadata_path) + + # create a duplicate parameters and add the seed + retrain_pparams.seed = seed + retrain_pparams.model_uuid = None + + # retrain the model + retrain_pipe = mp.ModelPipeline(retrain_pparams) + retrain_pipe.train_model() + + # extract the metrics from the retrained model + with open(os.path.join(retrain_pparams.output_dir, 'model_metrics.json'), 'r') as f: + retrained_model_metrics = json.load(f) + + retrained_metrics = 
find_best_test_metric(retrained_model_metrics) + retrained_accuracy = retrained_metrics['prediction_results']['accuracy_score'] + retrained_precision = retrained_metrics['prediction_results']['precision'] + retrained_recall = retrained_metrics['prediction_results']['recall_score'] + retrained_prc_auc = retrained_metrics['prediction_results']['prc_auc_score'] + + print("Model reproducibility results") + print("Accuracy difference:", abs(original_accuracy-retrained_accuracy)) + print("Precision difference:", abs(original_precision-retrained_precision)) + print("Recall difference:", abs(original_recall-retrained_recall)) + print("PRC AUC difference:", abs(original_prc_auc-retrained_prc_auc)) + + assert abs(original_accuracy - retrained_accuracy) < 1e-9 \ + and abs(original_precision - retrained_precision) < 1e-9 \ + and abs(original_recall - retrained_recall) < 1e-9 \ + and abs(original_prc_auc - retrained_prc_auc) < 1e-9 #------------------------------------------------------------------- #-------- random forest @@ -278,14 +316,14 @@ def test_k_fold_cv_xgboost_undersampling(): #print("train_valid_test_NN_undersampling_test") #test_train_valid_test_NN_undersampling() - print("kfold_cv_NN_SMOTE_test") - test_k_fold_cv_NN_SMOTE() + #print("kfold_cv_NN_SMOTE_test") + #test_k_fold_cv_NN_SMOTE() #print("kfold_cv_NN_undersampling_test") #test_k_fold_cv_NN_undersampling() - #print("kfold_cv_RF_SMOTE_test") - #test_k_fold_cv_RF_SMOTE() + print("kfold_cv_RF_SMOTE_test") + test_k_fold_cv_RF_SMOTE() #print("kfold_cv_RF_undersampling_test") #test_k_fold_cv_RF_undersampling() diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py index 7231ec28..9e9a1f2f 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py @@ -91,7 +91,7 @@ def saved_model_identity(pparams): #retrain_pipe = train(pparams) # extract the metrics from the 
retrained model - with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f: + with open(os.path.join(retrain_pparams.output_dir, 'model_metrics.json'), 'r') as f: retrained_model_metrics = json.load(f) retrained_metrics = find_best_test_metric(retrained_model_metrics) From f2a30a96813c9d504d327f2205bb7585d9ee56d4 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Tue, 24 Sep 2024 22:29:41 -0400 Subject: [PATCH 30/57] resolve errors --- atomsci/ddm/pipeline/model_datasets.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index fb2957e2..7ad2fee9 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -723,13 +723,8 @@ def get_subset_responses_and_weights(self, subset, transformers): """ if subset not in self.subset_response_dict: if subset in ('train', 'valid', 'train_valid'): -<<<<<<< HEAD for fold, (train, valid) in enumerate(self.train_valid_dsets): dataset = self.combined_training_data() - -======= - dataset = self.combined_training_data() ->>>>>>> 0c83b6b6b5a94cc6fce944ea2a099775bbbcff09 elif subset == 'test': dataset = self.test_dset else: From 410f03d0dfad7a686bd33f83498f32076e01e143 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 25 Sep 2024 10:23:05 -0700 Subject: [PATCH 31/57] Added seed to test_balancing_transformer for more consistent outputs --- .../balancing_trans/test_balancing_transformer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 9e5cd0eb..2508b0ea 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -100,7 +100,9 @@ def train_model_wo_balan(dset_key, split_uuid, res_dir): "save_results": 
"False", "max_epochs": "500", "early_stopping_patience": "50", - "verbose": "False" + "verbose": "False", + + "seed":"0" } for i in range(nreps): @@ -147,7 +149,9 @@ def train_model_w_balan(dset_key, split_uuid, res_dir): "save_results": "False", "max_epochs": "500", "early_stopping_patience": "50", - "verbose": "False" + "verbose": "False", + + "seed":"0" } for i in range(nreps): From f2478935ef1148f9a4f78429506986d23101960c Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 25 Sep 2024 10:29:49 -0700 Subject: [PATCH 32/57] added a test to make sure that multitask problems don't work with SMOTE --- ...nanobret_multitask_classification_data.csv | 431 ++++++++++++++++++ ...d_e34ba827-a532-4313-9e63-8a9b0ed18ba9.csv | 431 ++++++++++++++++++ .../sampling_test/test_sampling_mtss_model.py | 77 ++++ 3 files changed, 939 insertions(+) create mode 100755 atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data.csv create mode 100755 atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data_train_valid_test_multitaskscaffold_e34ba827-a532-4313-9e63-8a9b0ed18ba9.csv create mode 100755 atomsci/ddm/test/integrative/sampling_test/test_sampling_mtss_model.py diff --git a/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data.csv b/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data.csv new file mode 100755 index 00000000..f2d8d242 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data.csv @@ -0,0 +1,431 @@ +,compound_id,base_rdkit_smiles,NEK1_relation,NEK1_mean_pIC50,NEK11_relation,NEK11_mean_pIC50,NEK2_relation,NEK2_mean_pIC50,NEK3_relation,NEK3_mean_pIC50,NEK4_relation,NEK4_mean_pIC50,NEK5_relation,NEK5_mean_pIC50,NEK6_relation,NEK6_mean_pIC50,NEK9_relation,NEK9_mean_pIC50,NEK1_active,NEK11_active,NEK2_active,NEK3_active,NEK4_active,NEK5_active,NEK6_active,NEK9_active 
+0,PAR_272,O=C(c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)N1CCC(N2CCCCC2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +1,PAR_315,CCN(CC)S(=O)(=O)c1cc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)ccc1Cl,,5.552056004461681,,5.4574860448455205,,5.238326759769438,,5.430757234170256,,5.445163827978223,,5.486603647930933,<,4.522878745280337,,5.252417203511357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +2,ZDG_7_52_4,CN1CCN(c2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)nc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +3,HO_N_101,COc1ccc(Nc2nc(-c3ccc(OC)c(OC)c3)cc3nccn23)c(C(N)=O)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +4,PAR_337,CC(C)(Oc1ccccc1)C(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.779291879459036,,5.032271077856564,,4.6690495057318,,6.592673502322183,,4.820981977097612,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0 +5,Narazaciclib,CN1CCN(c2ccc(Nc3ncc4cc(C#N)c(=O)n(C5CCCC5)c4n3)cc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +6,HO_N_57_1,COc1ccc(-c2cc3nccn3c(Nc3ccccc3C#N)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+7,BA_03_61_01,COc1cc(Nc2ncc3c(C)cc(=O)n(C4CCOCC4)c3n2)cc(OC)c1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +8,PAR_363,N#CCNC(=O)c1ccc(-c2ccnc(Nc3cccc(CN4CCOCC4)c3)n2)cc1,,6.151159915288579,,4.583360913493843,,4.832123723463034,,6.1451321031000266,,4.635987721962014,,6.851435811405105,<,4.522878745280337,,5.269015756005547,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +9,PAR_182,CN1CCN(c2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cc2F)CC1,,5.509596275828427,<,4.522878745280337,,4.835439584676596,,5.8385226992247405,<,4.522878745280337,,6.49132807858564,<,4.522878745280337,,5.156335184776021,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +10,PAR_335,C#Cc1cc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)ccc1Cl,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.237059415948903,<,4.522878745280337,,5.254416153865931,<,4.522878745280337,,4.802579991224726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +11,BA_03_69_c,COc1cc2ncnc(-c3ccc(C(N)=O)c(F)c3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +12,PAR_334,O=C(Nc1ccc(-c2ccnc(Nc3cccc(N4CCOCC4)c3)n2)cc1)C1CCCN1,,5.027331341083788,<,4.522878745280337,<,4.522878745280337,,6.230662437350819,<,4.522878745280337,,5.892418801404399,<,4.522878745280337,,5.308137455897522,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0 +13,ZDG_7_50_2,CN1CCN(c2ccc(Nc3nccc(-c4ccc(NC(=O)[C@@H]5CCCN5)cc4)n3)cc2F)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+14,HO_N_135_4_A,COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +15,3746,N#Cc1c(Nc2nc(Nc3ccc(N4CCNCC4)cc3)ncc2Cl)cccc1OCc1c(F)cccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +16,3827,CCOc1ccc2c(-c3ccnc(Nc4cccc(Br)c4)n3)cnn2n1,,,,,,,,,,,,6.752026733638193,,,,,,,,,,1.0,, +17,Altiratinib,O=C(Nc1cc(Oc2cc(F)c(NC(=O)C3(C(=O)Nc4ccc(F)cc4)CC3)cc2F)ccn1)C1CC1,,5.250812839247721,,5.092031771170105,,5.0900326887847855,,5.022491146605316,,5.346348290654262,<,4.522878745280337,<,4.522878745280337,,4.975134466817432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +18,PAR_252,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)c1cccc(-c2cccnc2)c1,,5.0542453284308175,,5.100565607974711,,4.900463929543359,,5.012345426898412,,4.460761357368833,,4.980072153113726,<,4.522878745280337,,4.766580186299862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +19,ATH686,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4ccnc(N)n4)cc3)cc2C(F)(F)F)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.681749789015572,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +20,TL01_022,COc1ccc(Nc2nccc(-c3ccc(S(=O)(=O)NCCN)cc3)n2)cc1,,5.712814683943481,<,4.522878745280337,,4.6322874985094575,,4.978363242436672,<,4.522878745280337,,6.341054794006962,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +21,ZDG_7_43_3,Cn1cc(-c2ccc(N3CCOCC3)cc2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+22,BA_03_53_12,COc1ccc2c(-c3ccnc(Nc4cccc(N5CCOCC5)c4)n3)cnn2n1,,6.062751461067501,,5.118481547126134,,5.487245060747732,,6.373063665619277,,5.033558942245732,<,4.522878745280337,<,4.522878745280337,,6.705887594577249,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0 +23,PAR_379,CN(C)CCCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.05937779651617,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +24,PAR_376,O=C(CC12CC3CC(CC(C3)C1)C2)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.953028453525107,,5.006589807434891,,4.846953375389458,,5.046613355088627,,4.981795846064236,,5.115687643819264,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +25,PAR_375,Cc1cccc(C(=O)Nc2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)n1,,5.051954664361819,,5.048577381602782,,4.824723451097711,,4.943373766429193,,4.945726828962131,,5.188651826933941,<,4.522878745280337,,4.915625881147178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +26,PAR_331,O=C(Nc1ccc(-c2ccnc(Nc3cc(N4CCOCC4)cc(C(F)(F)F)c3)n2)cc1)C1CCCN1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.738666862982437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +27,BA_03_50_04,COc1ccc2ncnc(Nc3cccc(S(N)(=O)=O)c3)c2c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +28,PAR_403,COc1ccccc1CC(=O)N[C@H]1CCN(c2ccnc(Nc3cc(OC)c(OC)c(OC)c3)n2)C1,,4.622548972536967,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+29,GSK329,CNc1cc(Oc2c(Cl)cc(NC(=O)Nc3cccc(C(F)(F)F)c3)cc2Cl)ncn1,<,4.522878745280337,,5.1792726384345125,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +30,PAR_225,N#CCNC(=O)c1ccc(-c2ccnc(NCCn3ccnc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +31,PAR_380,O=C(Nc1cc(O)ccn1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.920029645072777,<,4.522878745280337,<,4.522878745280337,,6.095333598804788,<,4.522878745280337,,6.511594894580872,<,4.522878745280337,,5.183946010215339,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +32,ZDG_7_46_3,Cn1cc(-c2cc3ccccc3o2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +33,HO_N_138_3_A,Cn1ncc2cc(-c3cc4nccn4c(Nc4ncccc4C(N)=O)n3)ccc21,,4.55056495781959,<,4.522878745280337,<,4.522878745280337,,4.672852955355713,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +34,HO_N_90,COc1ccc(-c2cc3nccn3c(Nc3cc4ccccc4cn3)n2)cc1OC,,4.764212049256466,,4.67578301752895,,4.644063314968692,<,4.522878745280337,,4.765623989645558,<,4.522878745280337,<,4.522878745280337,,4.90264845805378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +35,PAR_142,O=C(NCCC1CCOCC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.6308182614698215,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+36,BA_03_56_12,CCOc1ccc2c(-c3ccnc(Nc4cccc(N5CCOCC5)c4)n3)cnn2n1,,6.02374236875252,,5.056969533906477,,5.365743632392702,,6.281789923080026,,4.976667870676705,,7.744195610646878,<,4.522878745280337,,6.463483079487036,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0 +37,TL01_026,COc1ccccc1Nc1nccc(-c2ccc(S(=O)(=O)NCCN)cc2)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +38,GW843682,COc1cc2ncn(-c3cc(OCc4ccccc4C(F)(F)F)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +39,BA_03_60_04,CCCCn1c(=O)cc(C)c2cnc(Nc3ccc(N4CCN(C)CC4)c(F)c3)nc21,,4.561733678166957,<,4.522878745280337,,5.711470552571224,<,4.522878745280337,,4.637347915145522,,5.159545510921731,<,4.522878745280337,,5.401638890483255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +40,HO_N_135_2_A,NC(=O)c1cccnc1Nc1nc(-c2ccc3c(c2)OCCO3)cc2nccn12,<,4.522878745280337,,4.986066288377341,,5.090364874929746,,5.502197651941389,,5.08345638049453,,5.49749941815019,<,4.522878745280337,,5.4066312032053725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +41,AST487,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)cc2C(F)(F)F)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +42,PAR_310,N#CCNC(=O)c1ccc(-c2ccnc(Nc3ccc(Cl)cc3)n2)cc1,,5.458554470785787,,5.096843984530558,,4.728376307122485,,5.254035848504938,<,4.522878745280337,,6.376312835930973,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 
+43,PAR_268,CC(C)(C)CNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.890530010199987,,4.985652618950243,,4.924572396157946,,5.500165740249142,<,4.522878745280337,,6.4941348376070565,<,4.522878745280337,,5.183988608659089,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +44,HO_N_133_2_A,NC(=O)c1cccnc1Nc1nc(-c2ccnc(F)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +45,BA_03_53_01,COc1ccc2c(-c3ccnc(Nc4cc(OC)c(OC)c(OC)c4)n3)cnn2n1,,5.773880703765544,,5.112542418995005,,5.374874677070926,,5.551742176902253,,4.927485323089146,,7.174881717215393,<,4.522878745280337,,6.891121194058334,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +46,TL01_019,O=S(=O)(NCCNS(=O)(=O)C1CC1)c1ccc(-c2ccnc(Nc3ccccc3)n2)cc1,,5.6302895225802505,,5.015573097083401,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,6.447615587988817,<,4.522878745280337,,4.991791916486836,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +47,PAR_291,COc1cc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)ccc1N1CCOCC1,,4.653285401787367,<,4.522878745280337,<,4.522878745280337,,4.997500809031838,<,4.522878745280337,,5.326959152242042,<,4.522878745280337,,4.894680040351775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +48,ZDG_7_40_C,Cc1ccccc1CCC(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,,5.200969778775301,,4.776809615678486,,5.775092389602924,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +49,ZDG_6_64,Cn1nc(-c2cccc(NS(C)(=O)=O)c2)c2nc(Nc3ccc(Cl)cc3)ncc21,,5.424252817681134,,5.174190495793877,,5.275611497701943,,5.367483415538971,,5.211276375328347,<,4.522878745280337,<,4.522878745280337,,5.421185973017695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+50,HO_N_136_6_A,Cn1nccc1-c1cc2nccn2c(Nc2ncccc2C(N)=O)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +51,BA_03_59_02,Cc1cc(=O)n(C2CCC2)c2nc(Nc3cccc(N4CCOCC4)c3)ncc12,,5.001601822254985,,4.693183054753221,,6.373729650814312,<,4.522878745280337,<,4.522878745280337,,6.082990515782506,<,4.522878745280337,,5.437486302264579,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0 +52,BA_03_78_d,Clc1ccc(CCNc2ncnc3sc(Br)cc23)cc1,<,4.522878745280337,,5.350427583242821,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +53,ZDG_7_44_1,Cn1cc(C2=CCOCC2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +54,PAR_294,CN1CCN(c2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cc2)CC1,,5.044635900449657,<,4.522878745280337,,4.560321091507077,,5.273291252702858,<,4.522878745280337,,5.827809393613137,<,4.522878745280337,,4.842322140279872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +55,BBT594,CC(=O)Nc1cc(Oc2ccc3c(c2)CCN3C(=O)Nc2ccc(CN3CCN(C)CC3)c(C(F)(F)F)c2)ncn1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.799164165543371,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +56,PAR_298,CN1CCN(C(=O)c2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cc2)CC1,,5.251145281525807,<,4.522878745280337,<,4.522878745280337,,5.0225671244809424,<,4.522878745280337,,5.613491143015396,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+57,PAR_269,O=C(NCC1CCC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.994533139752917,,4.927268579769375,,4.883063139568752,,5.352743745097549,,4.959082244697873,,5.608610180689029,<,4.522878745280337,,5.144686349387098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +58,PAR_404,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)c4ccc(F)cc4F)C3)n2)cc(OC)c1OC,,5.558465684840074,,4.541361905337933,,4.900332026351334,<,4.522878745280337,,4.739977521082981,,5.572057466455448,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +59,EF_3_203,CS(=O)(=O)c1cccc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)c1,,4.784048510365078,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.0071295739106665,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +60,HO_N_96,CN1CCN(c2ccc(Nc3ncc4c(n3)c(-c3cccc(NS(C)(=O)=O)c3)nn4C)nc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.408824154707516,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +61,ZDG_7_51_5,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccc(C(=O)N4CCN(C)CC4)cc3)ncc12,,4.90796498375648,<,4.522878745280337,,5.137433522832007,<,4.522878745280337,,4.62633228335401,,5.742267504842507,<,4.522878745280337,,5.141072538859764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +62,TL01_020,NCCNS(=O)(=O)c1ccc(-c2ccnc(NC3CCNCC3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +63,PAR_261,CC1CC1C(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.354036283168988,,4.914258788385221,,4.810569214288567,,5.059421625497709,,4.602053372163663,,5.929412530692798,<,4.522878745280337,,4.926643299867915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+64,HO_N_42,COc1ccc(-c2cc3nccn3c(Nc3ccccc3C(N)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +65,BA_03_50_15,COc1ccc2ncnc(NCCc3cccc(F)c3)c2c1,<,4.522878745280337,,5.138509873236028,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +66,PAR_158,CC(C#N)NC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.7386084272523705,,4.701338559819312,,5.121475239081094,,6.005153475423644,,4.729687113950752,,6.675994355441816,<,4.522878745280337,,5.366898760053524,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +67,PAR_355,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccn(C(F)F)n3)ncc12,,4.600182760304779,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.357145720986832,<,4.522878745280337,,4.694706720393334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +68,GSK2606414,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2cccc(C(F)(F)F)c2)c2c(N)ncnc21,,4.750939925719619,,5.224233686651976,,5.383612695573137,<,4.522878745280337,,4.256337912912628,,4.674469636983738,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +69,PAR_402,CNc1ccc(C(=O)N[C@H]2CCN(c3ccnc(Nc4cc(OC)c(OC)c(OC)c4)n3)C2)cc1,,5.3976646548335685,,5.152360176139837,,5.100609381453203,<,4.522878745280337,,5.219988218415373,,5.787184715248259,<,4.522878745280337,,4.920008110197255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +70,3820,CCOc1ccc2c(-c3ccnc(Nc4ccc(N5CCN(C)CC5)c(F)c4)n3)cnn2n1,,,,,,,,,,,,7.327902142064281,,,,,,,,,,1.0,, +71,Tovorafenib,C[C@@H](NC(=O)c1ncnc(N)c1Cl)c1ncc(C(=O)Nc2cc(C(F)(F)F)c(Cl)cn2)s1,<,4.522878745280337,,4.768657012738542,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+72,ZDG_6_75_4,COc1cc(Nc2ncc3c(n2)c(-c2ccc(F)c(F)c2)nn3C)cc(OC)c1OC,,6.084243400188249,<,4.522878745280337,,5.6204374173285006,,5.94641893903062,<,4.522878745280337,,7.6506234286994035,<,4.522878745280337,,5.529835856645433,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +73,BA_03_50_18,COc1ccc2ncnc(NCCc3cccc(Br)c3)c2c1,<,4.522878745280337,,5.173425534719117,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +74,PAR_323,Cc1ccc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)cc1O,,4.835689359282434,,4.573961326928361,<,4.522878745280337,,5.734303057224471,<,4.522878745280337,,5.839903129257649,<,4.522878745280337,,4.7368123982363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +75,PAR_112,CC(C)(CN)CNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +76,HO_N_105,COc1ccc(-c2cc3nccn3c(Nc3ccc(N4CCN(C)CC4)cc3)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +77,BA_03_56_11,CCOc1ccc2c(-c3ccnc(Nc4cc(N5CCOCC5)cc(C(F)(F)F)c4)n3)cnn2n1,,4.87182347956317,<,4.522878745280337,,4.742710778519723,,4.791763951581726,<,4.522878745280337,,6.229989330291368,<,4.522878745280337,,6.356460014325777,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +78,GCN2_IN_1,c1nn(C2CCOCC2)cc1Nc1ncc2nnn(-c3ccc4cn[nH]c4c3)c2n1,,4.73444963419206,<,4.522878745280337,,5.686990572523224,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+79,BA_03_61_07,COc1ccccc1Nc1ncc2c(C)cc(=O)n(C3CCOCC3)c2n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +80,PAR_314,Cc1cc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)ccc1N1CCOCC1,,5.400563622013164,,4.642276208868983,,4.830992345816636,,5.410659988231124,<,4.522878745280337,,6.183031258097228,<,4.522878745280337,,5.102286733139374,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +81,PAR_157,N#CNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.608026971547095,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +82,PAR_109,CC(C)N(C(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C(C)C,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.74539665738371,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +83,ZDG_7_39_A,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2ccc(C(C)(C)C)cc2)c2c(N)ncnc21,,4.715600709998829,,5.687502309749495,,6.283088612988116,<,4.522878745280337,,5.209563989701855,,4.666105933885059,<,4.522878745280337,<,4.522878745280337,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0 +84,PAR_406,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)C4CCN(C)CC4)C3)n2)cc(OC)c1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +85,BA_03_55_01,COc1cc(Nc2nccc(-c3cnn4ncccc34)n2)cc(OC)c1OC,,5.132987567403431,<,4.522878745280337,,4.856408882448614,<,4.522878745280337,<,4.522878745280337,,5.939016896221744,<,4.522878745280337,,5.883734193517436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+86,Axitinib,CNC(=O)c1ccccc1Sc1ccc2c(/C=C/c3ccccn3)n[nH]c2c1,<,4.522878745280337,,4.867104511077254,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +87,BA_03_50_05,COc1ccc2ncnc(Nc3c(F)cccc3F)c2c1,<,4.522878745280337,,4.596618862743608,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +88,3829,COc1cccc(Nc2nccc(-c3cnn4nc(OC)ccc34)n2)c1,,5.255655162286165,,4.846115892045417,,4.900122251889135,,5.610499231954557,,4.715548992582638,,6.705214911312019,<,4.522878745280337,,6.072790227115989,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +89,ZDG_7_31_A,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2cccc(Cl)c2)c2c(N)ncnc21,,5.452325606109792,,5.400397619401221,,5.831193316226468,<,4.522878745280337,<,4.522878745280337,,5.060020330334237,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +90,ZDG_2_91,CCn1nc(-c2cccc(NS(C)(=O)=O)c2)c2nc(Nc3cc(OC)c(OC)c(OC)c3)ncc21,<,4.522878745280337,,5.363141976659056,,6.800229265996677,,5.580865079060308,,5.392522529985525,,6.964068228651505,<,4.522878745280337,,6.9750003957975135,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0 +91,PAR_159,N#CCCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.553248711844009,<,4.522878745280337,,4.720220027609966,,5.556286074241282,<,4.522878745280337,,5.991463349980734,<,4.522878745280337,,5.309241281674459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +92,HO_N_133_4_A,NC(=O)c1cccnc1Nc1nc(-c2ccc(S(=O)(=O)N3CCCC3)cc2)cc2nccn12,,5.167744975172301,,5.0077681177720725,<,4.522878745280337,,4.627196303033837,<,4.522878745280337,,4.887691096286184,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +93,2093,CN(C)C/C=C/C(=O)Nc1cccc(C(=O)Nc2ccc(Nc3nccc(-c4cccnc4)n3)cc2F)c1,,,,,,,,,,,,5.6925039620867866,,,,,,,,,,0.0,, 
+94,PAR_405,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)CC45CC6CC(CC(C6)C4)C5)C3)n2)cc(OC)c1OC,,4.981411343474717,,4.761225157557821,,4.72133828277282,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +95,HO_N_67,COc1ccc(-c2cc3nccn3c(Nc3ccccc3NS(C)(=O)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +96,HO_N_136_5_A,NC(=O)c1cccnc1Nc1nc(-c2ccccc2F)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.766690674784758,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.600551282564767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +97,HO_N_57_2,COc1ccc(-c2cc3nccn3c(Nc3cccnc3C(N)=O)n2)cc1OC,,5.23997849830724,,4.969907720086619,,5.191222602450658,<,4.522878745280337,,5.242906129665422,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +98,PAR_320,CC(C)c1ccc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)cc1,,5.543472992608432,,4.928232072657166,<,4.522878745280337,,5.411014810538559,,4.755247751878081,,6.353178256283163,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +99,Encorafenib,COC(=O)N[C@@H](C)CNc1nccc(-c2cn(C(C)C)nc2-c2cc(Cl)cc(NS(C)(=O)=O)c2F)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +100,ZDG_6_51_2,COc1cc(Nc2ncc3c(n2)c(-c2cccc(NC(=O)CC#N)c2)nn3C)cc(OC)c1OC,,6.52944874090885,,5.696549972826222,,6.530886968451726,,6.357595202448904,,5.638167199966386,,8.103824589968996,<,4.522878745280337,,7.184540852844173,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 
+101,BA_03_56_14,CCCCNC(=O)c1cc(F)cc(Nc2nccc(-c3cnn4nc(OCC)ccc34)n2)c1,,5.189636537748015,<,4.522878745280337,,5.204904108573477,,5.659405529983294,,5.19138944283402,,6.214638971935892,<,4.522878745280337,,5.645197983557392,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +102,3819,CCOc1ccc2c(-c3ccnc(Nc4ccc(OC)c(OC)c4)n3)cnn2n1,,,,,,,,,,,,7.537602002101043,,,,,,,,,,1.0,, +103,Rac_CCT_250863,CC(/C=C\C(F)(F)F)Oc1cc(-c2cc(-c3cc(CN(C)C)cs3)cnc2N)ccc1C(N)=O,<,4.522878745280337,<,4.522878745280337,,5.471954824304111,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +104,PAR_330,CS(=O)(=O)c1cccc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)c1,,4.944417751456352,,4.548317198621807,<,4.522878745280337,,5.730813648408817,<,4.522878745280337,,5.480188371406996,<,4.522878745280337,,5.378530517324077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +105,PAR_377,CC1(C)CC1C(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,6.105174967139251,,5.753508816051484,,5.623341311836982,,5.80430564114047,,5.432611278347096,,6.792188865778642,<,4.522878745280337,<,4.522878745280337,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +106,PAR_383,CCCc1ccnc(NC(=O)c2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)c1,,5.29199862846524,,5.3124428431629465,,5.135043190488083,,5.174283523278509,,5.239410092259872,,5.375056794357965,<,4.522878745280337,,5.022852102453122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +107,PAR_382,O=C(NCC1CC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.371295924863484,,5.090630710422227,,4.993542118442684,,5.54391657291866,,5.043535161781369,,6.282045923356396,<,4.522878745280337,,5.492640141638793,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +108,ZDG_7_38_A,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2ccc(C#N)cc2)c2c(N)ncnc21,,5.04880558733107,,4.809090020124783,,5.060272274498873,<,4.522878745280337,,4.743080435741345,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+109,Bafetinib,Cc1ccc(NC(=O)c2ccc(CN3CC[C@H](N(C)C)C3)c(C(F)(F)F)c2)cc1Nc1nccc(-c2cncnc2)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +110,ZDG_7_48_1,CN(C)CC(=O)Nc1ccc(-c2cn(C)c3ncnc(N)c23)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +111,BA_03_55_14,CCCCNC(=O)c1cc(F)cc(Nc2nccc(-c3cnn4ncccc34)n2)c1,,5.2047710048054485,,5.111339314960842,,5.124041250648066,,5.2210423353814015,,5.014494360747044,,5.303215985283345,<,4.522878745280337,,5.346217595956477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +112,HO_N_131_5_A,Cn1cc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cn1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.628146143269041,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +113,PAR_371,Cc1nocc1C(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.117821924352508,,4.687588190158389,,4.704974794280974,,5.327696133354295,,4.743105526322666,,5.725625892672547,<,4.522878745280337,,4.960449315064702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +114,PAR_369,CC1(C(=O)Nc2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)CCCC1,,6.127525053826286,,5.784071210080595,,5.50639449696777,,6.218905861112133,,4.771642567171203,,7.190435850083502,<,4.522878745280337,,6.041095247784531,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0 +115,ZDG_6_49_1,COc1cc(Nc2ncc3c(-c4cccc(NCC(F)(F)F)c4)nn(C)c3n2)cc(OC)c1OC,,6.691293263430712,,5.284512969713438,,6.212753276097712,,6.966896451516143,,5.369403937201246,<,4.522878745280337,<,4.522878745280337,,6.663831731885772,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 
+116,HO_N_136_4_A,CS(=O)(=O)Nc1cccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)c1,,4.770388717953961,,4.543240069450591,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +117,ZDG_7_33_C,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2cscn2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +118,BA_03_51_15,Fc1ccc(CCNc2ncnc3ccc(F)cc23)cc1,<,4.522878745280337,,5.230087157442037,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +119,BA_03_66_a,COc1cc2ncn(-c3cc(OCc4sccc4OC)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +120,BA_03_53_11,COc1ccc2c(-c3ccnc(Nc4cc(N5CCOCC5)cc(C(F)(F)F)c4)n3)cnn2n1,,5.113626204819444,,4.769725578487549,,5.080090690342586,,5.1028409189818,,4.804196269857379,,6.217860891122144,<,4.522878745280337,,6.7256421310134185,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +121,ZDG_7_25_3,COc1cc2ncn(-c3cc(OCc4ncc(C(C)(C)C)o4)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +122,HO_N_116,COc1ccc(-c2cc3nccn3c(Nc3cccc(S(C)(=O)=O)c3)n2)cc1OC,,4.776373940503477,,4.6008913847120745,<,4.522878745280337,<,4.522878745280337,,4.526746521215781,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+123,Brepocitinib,Cn1cc(Nc2nccc(N3CC4CCC(C3)N4C(=O)[C@@H]3CC3(F)F)n2)cn1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +124,HO_N_115,COc1ccc(-c2cc3nccn3c(NCCn3ccnc3)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +125,BA_03_50_20,COc1ccc2ncnc(NCCc3ccc(Br)cc3)c2c1,<,4.522878745280337,,5.608640769372401,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +126,ZDG_7_44_3,Cn1cc(-c2cccc(OC(F)(F)F)c2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.417198605305975,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +127,BA_03_69_a,COc1cc2ncnc(-c3ccc(C(N)=O)cc3F)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +128,HO_N_137_A,Cc1noc(C)c1-c1cc2nccn2c(Nc2ncccc2C(N)=O)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +129,HO_N_135_4_E,COC(=O)c1cccnc1Nc1nc(-c2ccc(OC)c(OC)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+130,PAR_249,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C1CCC(O)CC1,,5.007421559397016,<,4.522878745280337,<,4.522878745280337,,5.55859615288902,<,4.522878745280337,,5.8989347544814414,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +131,PAR_361,CN1CCC(n2cc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cn2)CC1,,5.695880618567663,<,4.522878745280337,,4.8412766118629005,,5.394258787411851,<,4.522878745280337,,5.953479188966091,<,4.522878745280337,,4.873614798292431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +132,PAR_274,Cn1cc(CNC(=O)c2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)cn1,,4.936337868347781,,4.721888786879858,,4.647499419165893,,5.387318497835222,,4.658289417749323,,5.99379904251287,<,4.522878745280337,,5.211302575018534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +133,ZDG_5_55_6,CCC(CN)NC(=O)c1ccc(-c2ccncc2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +134,ZDG_7_45_1,Cn1cc(-c2ccc(F)c(F)c2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +135,BA_03_50_21,COc1ccc2ncnc(NCCc3ccc(C)cc3)c2c1,<,4.522878745280337,,5.094618114663208,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +136,HO_N_95,COc1ccc(-c2cc3nccn3c(Nc3ccc(C(F)(F)F)cc3C(N)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.948428348248115,<,4.522878745280337,,4.742395135371631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+137,EF_3_101,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)Cc4ccc(C#N)cc4)C3)n2)cc(OC)c1OC,,5.276146420512976,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.325549249924125,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +138,ZDG_6_50_1,COc1cc(Nc2ncc3c(-c4cccc(NC(=O)N5CCCC5)c4)nn(C)c3n2)cc(OC)c1OC,,6.0959209900587465,,5.649537330654971,,6.00111694950526,,5.103189913336463,,5.355920820754059,,7.112201749932475,<,4.522878745280337,,5.987651242211193,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0 +139,ZDG_7_52_6,CN1CCN(c2cccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)c2)CC1,,5.896288707022026,,4.591986977837131,,5.022729260872255,,5.875633568854028,,4.567101299453912,,6.736106345680188,<,4.522878745280337,,5.48560900469894,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +140,HO_N_104,COc1ccc(-c2cc3nccn3c(Nc3ccc(N4CCN(C)CC4)cn3)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +141,HO_N_99,CN1CCC(Nc2ncc3c(n2)c(-c2cccc(NS(C)(=O)=O)c2)nn3C)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +142,PAR_277,O=C(NCC(C(F)(F)F)C(F)(F)F)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.971615901589376,,4.599098959051357,<,4.522878745280337,,5.307259218619125,<,4.522878745280337,,6.143431243638508,<,4.522878745280337,,4.781712083555128,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +143,BA_03_50_01,COc1ccc2ncnc(CCc3ccccc3)c2c1,<,4.522878745280337,,5.238687151877101,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+144,ZDG_7_47_1,COc1ccc2cc(-c3cn(C)c4ncnc(N)c34)ccc2c1,,4.730819331639776,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +145,Culmerciclib,CC(C)c1c2cc(-c3nc(Nc4ccc(N5CCNCC5)cn4)ncc3F)ccc2nn1C,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +146,BA_03_55_12,c1cc(Nc2nccc(-c3cnn4ncccc34)n2)cc(N2CCOCC2)c1,,5.625983771126986,,4.96686922858421,,5.294320266103083,,5.976203849797534,,4.878083910966451,,6.683531800444131,<,4.522878745280337,,5.887845239897477,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +147,3_IN_PP1,CC(C)(C)n1nc(-c2c[nH]c3ccccc23)c2c(N)ncnc21,<,4.522878745280337,,4.741264885082795,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +148,GCN2iB,COc1ncc(Cl)cc1S(=O)(=O)Nc1ccc(F)c(C#Cc2cnc(N)nc2)c1F,,5.905601549290766,,5.494359015340263,,5.57696763782168,,5.521207096930893,,4.758877132726841,,6.438058632016267,<,4.522878745280337,,5.112039392346211,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +149,Tivozanib_hydrochloride_hydrate,COc1cc2nccc(Oc3ccc(NC(=O)Nc4cc(C)on4)c(Cl)c3)c2cc1OC,,4.90237267485998,,5.280179898185816,,4.983637127187442,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +150,PAR_89,O=C(Nc1nc2c(s1)-c1nc(-c3ccccc3Cl)ncc1CC2)NC1CCCCC1,,5.397748629891981,,4.927215857126657,,5.0284058896320944,,5.063894536958506,,4.93570189742568,,5.898064345968216,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+151,ZDG_7_23_1,COc1cc2ncn(-c3cc(OCc4ccc(Cl)cc4)c(C(N)=O)s3)c2cc1OC,,5.0638445779730095,,5.048480808348494,,5.154283327307828,<,4.522878745280337,,5.076771084592921,<,4.522878745280337,<,4.522878745280337,,4.827103001760606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +152,EF_3_103,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)Cc4ccc(C(C)(C)C)cc4)C3)n2)cc(OC)c1OC,<,4.522878745280337,,4.75519581981897,,4.7533897131839575,<,4.522878745280337,,4.8156208231632975,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +153,PAR_244,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)c1ccc[nH]1,,5.307870853260097,,5.124929880831664,,5.060368222007157,,5.105359328915169,,5.136804697549763,,5.463892030482261,<,4.522878745280337,,5.115628742612033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +154,MAX_40279,COc1cc(F)ccc1-c1c(C)sc2cnc(Nc3cnn(C4CCNCC4)c3)nc12,,4.994315816911533,,4.992062700349886,,4.643409145529222,<,4.522878745280337,<,4.522878745280337,,5.46550356817656,<,4.522878745280337,,5.3554389564331055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +155,PAR_299,CN1CCN(c2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cc2Cl)CC1,,5.402937008085848,<,4.522878745280337,,4.913672332165274,,5.857028087339601,<,4.522878745280337,,6.195701854835417,<,4.522878745280337,,5.30909544980474,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +156,BA_03_50_08,COc1ccc2ncnc(NCCNS(C)(=O)=O)c2c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +157,BA_03_53_14,CCCCNC(=O)c1cc(F)cc(Nc2nccc(-c3cnn4nc(OC)ccc34)n2)c1,,5.301257022949197,,4.647571668485639,,5.180096129581832,,5.5231721832556575,,4.973305149673089,,6.061845613898452,<,4.522878745280337,,5.856854053011399,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 
+158,BA_03_50_22,COc1ccc2ncnc(NCCc3cc(Cl)ccc3Cl)c2c1,<,4.522878745280337,,5.174096061139031,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +159,HO_N_134_E,COC(=O)c1cccnc1Nc1nc(-c2ccc(F)c(OC)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +160,PAR_228,N#CCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)nc3)n2)cc1,,5.199047896812319,<,4.522878745280337,<,4.522878745280337,,5.419393534017114,<,4.522878745280337,,6.15102429675409,<,4.522878745280337,,5.077157095414884,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +161,ZDG_6_48_7,COc1cc(Nc2ncc3c(-c4cccc(NC(=O)C5CC5)c4)nn(C)c3n2)cc(OC)c1OC,,6.608812792635328,,5.880296586710919,,6.677117459945991,,6.432528506602515,,5.6002246719027,<,4.522878745280337,<,4.522878745280337,,6.7503642073489205,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +162,ZDG_7_46_4,Cn1cc(-c2ccccc2C(F)(F)F)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +163,HO_N_136_3_A,NC(=O)c1cccnc1Nc1nc(-c2cccs2)cc2nccn12,,4.869697704669959,,4.741316049922417,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +164,ZDG_6_59_1,COc1cc(Nc2ncc3c(n2)c(C#CC(C)(C)O)nn3C)cc(OC)c1OC,,5.53435254925281,<,4.522878745280337,,6.164754945156086,,5.716178678170349,<,4.522878745280337,,6.53804870085516,<,4.522878745280337,,5.428355979001119,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0 
+165,BA_03_53_06,COc1ccc2c(-c3ccnc(Nc4cccc(S(=O)(=O)C(F)(F)F)c4)n3)cnn2n1,,4.794035680291206,,5.00831752919434,<,4.522878745280337,<,4.522878745280337,,4.718394640831949,,5.184435892753189,<,4.522878745280337,,4.754263693738152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +166,ZDG_7_35_B,Cn1cc(-c2ccc3c(c2)CCN3C(=O)C2CC2(C)C)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,,4.529307812306122,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +167,BA_03_59_04,Cc1cc(=O)n(C2CCC2)c2nc(Nc3ccc(N4CCN(C)CC4)c(F)c3)ncc12,<,4.522878745280337,<,4.522878745280337,,5.07608559483432,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +168,ZDG_7_43_1,Cn1cc(-c2cccc(NS(C)(=O)=O)c2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +169,PAR_322,CC(C)Oc1ccc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)cc1,,5.670568967240122,,4.799488863914139,,4.7474789540645,,5.600383453872854,,4.829449729046927,,6.757435575848146,<,4.522878745280337,,5.036644931759788,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +170,PAR_342,Cn1ccc(C(=O)Nc2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)n1,,5.125953308074533,,4.868552068704895,,4.691765667809468,,5.265252663608496,,4.902513637128014,,5.514806652393473,<,4.522878745280337,,4.877638414011683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +171,HO_N_139_2,CNS(=O)(=O)c1ccccc1Nc1nc(-c2ccc(OC)c(OC)c2)cc2nccn12,,4.854251264510162,,4.829610548134223,,4.669920340904462,,5.114079721946229,,4.814355172887375,,4.934328040498036,<,4.522878745280337,,5.001589545970607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+172,HO_N_129,COc1ccc(-c2cc3nccn3c(Nc3cc(OC)c(OC)c(OC)c3)n2)cc1OC,,4.549123539647551,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +173,HO_N_134_A,COc1cc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)ccc1F,,4.64272200513473,,4.660609287879328,,4.64940575211658,,5.382040796273979,,4.964633396632863,,5.331311803157436,,4.960408141454957,,5.133822896182574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +174,ZDG_7_41_A,Cn1cc(-c2ccc3c(c2)CCN3C(=O)[C@@H]2COc3ccccc3O2)c2c(N)ncnc21,,4.920483900338917,,5.358817744240419,,5.911720141175999,<,4.522878745280337,<,4.522878745280337,,4.785450760086494,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +175,ZDG_6_61_N2,COc1cc(Nc2ncc3c(n2)c(-c2cccc(NS(C)(=O)=O)c2)nn3C)cc(OC)c1OC,,7.239464524517891,,5.54532033618566,,6.929489972508245,,7.200588833375107,,5.651864459185521,<,4.522878745280337,<,4.522878745280337,,6.950860579077381,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +176,PAR_358,Cn1cc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)cn1,,5.980481432534826,<,4.522878745280337,<,4.522878745280337,,5.821544101545006,<,4.522878745280337,,6.659884862724448,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +177,HO_N_135_2_E,COC(=O)c1cccnc1Nc1nc(-c2ccc3c(c2)OCCO3)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +178,PAR_374,N#Cc1ccc(C(=O)Nc2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)cc1,,5.763846309947991,,5.286270031691473,,4.8848036103750445,,5.0976792113515215,,5.002424477521072,,6.252348624286782,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 
+179,BA_03_59_10,Cc1cc(=O)n(C2CCC2)c2nc(Nc3ccc(C(=O)NC(C)C)cc3)ncc12,,5.009976603015099,,4.655446188325175,,5.498859479811906,<,4.522878745280337,,4.831448025809144,,5.719625901340856,<,4.522878745280337,,4.836575009331562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +180,PAR_311,COc1cc(N2CCOCC2)ccc1Nc1nccc(-c2ccc(C(=O)NCC#N)cc2)n1,,4.875562176211297,,4.672327574623886,,4.653894353690941,<,4.522878745280337,,4.583523615145392,,4.916383601137222,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +181,ZDG_5_55_7,NC[C@@H](NC(=O)c1ccc(-c2ccncc2)cc1)c1ccccc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +182,ZDG_7_37_B,Cc1nn(C)c(C)c1CC(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +183,BA_03_66_b,COc1cc2ncn(-c3cc(OCc4ccc(-c5ccncc5)s4)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +184,ZDG_7_11,COCCN(C)C(=O)c1ccc(Nc2ncc3c(C)cc(=O)n(C4CCCC4)c3n2)cc1,,5.038417673220524,<,4.522878745280337,,5.139779690319417,<,4.522878745280337,,4.819320265352316,,5.575840996623945,<,4.522878745280337,,5.1470809063912935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +185,ZDG_7_43_2,Cn1cc(-c2ccc3c(c2)CC(=O)N3)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+186,PAR_205,Cc1cc(C(=O)N[C@H]2CCN(C(=O)Nc3nc4c(s3)-c3nc(-c5ccccc5Br)ncc3CC4)C2)nn1C,,4.706292274445287,<,4.522878745280337,<,4.522878745280337,,5.1667083090804,<,4.522878745280337,,5.350574849239629,<,4.522878745280337,,4.7491437507531495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +187,ZDG_7_27_1,C#CCOc1cc(-n2cnc3cc(OC)c(OC)cc32)sc1C(N)=O,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +188,ZDG_7_39_C,COc1ccccc1CC(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,,5.029349539740927,,5.152811042534423,,5.054639888758325,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +189,ZDG_2_93,COc1cc(Nc2ncc3c(n2)c(-c2cccc(NS(C)(=O)=O)c2)nn3CC(F)F)cc(OC)c1OC,,6.349622199477918,,5.315822524551972,,6.037269575499916,<,4.522878745280337,,5.203465699071809,,5.801494773398018,<,4.522878745280337,,5.90938460604092,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0 +190,CE_245677,COc1ccc(C(=O)c2cn(C(C)C)c3ncnc(N)c23)cc1NC(=O)Nc1ccc(Cl)cc1Cl,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +191,ZDG_7_14,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccc(C(=O)N(C)C4CC4)cc3)ncc12,,5.444910558956879,,4.740223203749629,,5.403633688793094,<,4.522878745280337,<,4.522878745280337,,6.453841888779739,<,4.522878745280337,,5.147576584018467,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +192,BA_03_60_05,CCCCn1c(=O)cc(C)c2cnc(Nc3cccc(S(C)(=O)=O)c3)nc21,,4.71064546241489,<,4.522878745280337,,4.822919675499977,<,4.522878745280337,,4.553580199754428,,5.073861327272263,<,4.522878745280337,,5.320732617198839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+193,HO_N_136_2_E,COC(=O)c1cccnc1Nc1nc(-c2ccc(N3CCOCC3)cc2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +194,BA_03_59_03,Cc1cc(=O)n(C2CCC2)c2nc(Nc3ccc(N4CCOCC4)cc3)ncc12,<,4.522878745280337,<,4.522878745280337,,5.536492193799708,<,4.522878745280337,,4.543233595588991,,5.0516436816152375,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +195,PAR_324,COc1ccc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)cc1Cl,,4.924146656739188,,4.858009853722174,<,4.522878745280337,,5.539619861523638,<,4.522878745280337,,5.267145068892569,<,4.522878745280337,,4.761567653682853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +196,PAR_394,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)ccc1N1CCOCC1,<,4.522878745280337,,4.791377843945561,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +197,BA_03_51_19,COc1cccc(CCNc2ncnc3ccc(F)cc23)c1,<,4.522878745280337,,5.016639900263256,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +198,PAR_398,Cc1cc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)ccc1N1CCOCC1,<,4.522878745280337,,5.378755296656826,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.205494685035351,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +199,ZDG_7_21_2,COc1cc2ncn(-c3cc(OCc4ccc5ccccc5c4)c(C(N)=O)s3)c2cc1OC,,5.4205862155722455,,5.41167528380389,,5.437569032962673,,5.19403445575992,,5.445884596662054,<,4.522878745280337,<,4.522878745280337,,5.200270614667603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+200,ZDG_6_75_3,COc1cc(Nc2ncc3c(n2)c(-c2cccc(S(C)(=O)=O)c2)nn3C)cc(OC)c1OC,,6.891252183368805,<,4.522878745280337,,7.120274625212618,,7.426467977227534,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,7.468114687887472,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +201,HO_N_98,Cn1nc(-c2cccc(NS(C)(=O)=O)c2)c2nc(Nc3ccc(N4CCOCC4)cc3)ncc21,,6.106958653639852,,5.259735721832389,,6.212182010705377,,6.66685773646009,,5.277900710269178,,7.782353984182895,<,4.522878745280337,,6.607836931816712,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 +202,PAR_90,O=C(NCc1cccc(F)c1)Nc1nc2c(s1)-c1nc(-c3ccccc3Br)ncc1CC2,,5.077431289362347,,4.914928066065915,,4.694492166193196,,5.061981637898671,,4.988231387689668,,5.545137437022733,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +203,BA_03_60_01,CCCCn1c(=O)cc(C)c2cnc(Nc3cc(OC)c(OC)c(OC)c3)nc21,,5.659723155021703,,4.689374220300698,,5.7300472567178415,,5.339274626147458,,4.907717682813136,,6.357928805442624,<,4.522878745280337,,5.845826566736619,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +204,BA_03_59_05,Cc1cc(=O)n(C2CCC2)c2nc(Nc3cccc(S(C)(=O)=O)c3)ncc12,,5.042704399864713,,4.780353647381379,,5.583684563735657,<,4.522878745280337,,4.85857669611246,,5.476412266461561,<,4.522878745280337,,5.600960957136003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +205,PAR_338,Cc1ccccc1CCC(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.773842492893935,,5.663966776523732,,5.606180250657419,,5.8085663054789824,,5.740628400795056,,6.166726880865653,<,4.522878745280337,,5.5277440733905285,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +206,ZDG_7_44_4,COc1ncccc1-c1cn(C)c2ncnc(N)c12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+207,ZDG_7_43_4,COc1ccc(-c2cn(C)c3ncnc(N)c23)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +208,BA_03_59_07,COc1ccccc1Nc1ncc2c(C)cc(=O)n(C3CCC3)c2n1,,4.627146505382027,,4.53833768508282,,4.719127822874644,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +209,ZDG_6_48_4,COc1cc(Nc2ncc3c(-c4cccc(NS(=O)(=O)C(F)(F)F)c4)nn(C)c3n2)cc(OC)c1OC,,6.7362783823558665,,5.660878745330506,,7.070391042503128,,7.128762995687709,,4.933226575990378,<,4.522878745280337,<,4.522878745280337,,7.200467705751264,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +210,BA_03_65_c,COC(=O)c1sc(-n2cnc3cc(OC)c(OC)cc32)cc1OCc1sccc1F,,5.074721428638421,,4.965014934450227,,5.219896358959755,<,4.522878745280337,,4.849654758439626,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +211,HO_N_132_3_E,COC(=O)c1cccnc1Nc1nc(-c2ccc(OC)cc2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +212,BA_03_61_04,Cc1cc(=O)n(C2CCOCC2)c2nc(Nc3ccc(N4CCN(C)CC4)c(F)c3)ncc12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.9873329366252745,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +213,3853,COc1ccc2c(-c3ccnc(Nc4cccc(C(F)(F)F)c4)n3)cnn2n1,,,,,,,,,,,,5.608888386297197,,,,,,,,,,0.0,, +214,ZDG_6_60,Cn1nc(-c2cccc(NS(C)(=O)=O)c2)c2nc(Nc3ccc(C4CC4)c(P(C)(C)=O)c3)ncc21,,5.957822765913966,,5.180300495637555,,5.810888369041113,,6.517318783070742,,4.961545672241482,<,4.522878745280337,<,4.522878745280337,,6.366753716258976,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0 
+215,HO_N_133_1_A,NC(=O)c1cccnc1Nc1nc(C2=CCOCC2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.7820580655384335,<,4.522878745280337,,4.669912426300299,<,4.522878745280337,,4.574860731113751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +216,PAR_360,N#CCNC(=O)c1ccc(-c2ccnc(Nc3ccc(CN4CCOCC4)cc3)n2)cc1,,5.396623625558428,,4.7179591764319255,,4.891123299789348,,5.435824480800892,,4.818275633982171,,5.611671019139783,<,4.522878745280337,,5.15556164378403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +217,3723,Cc1cnc(Nc2ccc(N3CCNCC3)cc2)nc1Nc1cccc(OCc2ccccc2F)c1C#N,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +218,BA_03_80_A,Cc1cc(=O)n(C2CCOCC2)c2nc(Nc3ccc(N4CCN(C)CC4)cn3)ncc12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +219,3790,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)cc(OC)c1OC,,,,,,,,,,,,5.721246399047171,,,,,,,,,,0.0,, +220,BA_03_60_03,CCCCn1c(=O)cc(C)c2cnc(Nc3ccc(N4CCOCC4)cc3)nc21,,4.915000776345461,,4.674997182380425,,5.900361252603651,,4.880849434839875,,5.079615199699194,,5.499268882601292,<,4.522878745280337,,5.317257991313651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +221,BA_03_66_h,COc1cc2ncn(-c3cc(OCc4ccn(C)n4)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +222,BA_03_60_10,CCCCn1c(=O)cc(C)c2cnc(Nc3ccc(C(=O)NC(C)C)cc3)nc21,,4.678728850443628,<,4.522878745280337,,5.1861188093075405,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.896264415193177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +223,TC_S_7005,C[C@H](Nc1cc2c(-c3ccc4c(c3)OCO4)noc2cn1)c1ccccc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.83095972316797,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.816293228943018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+224,ZDG_7_24_1,COc1cc2ncn(-c3cc(OCc4cccc(F)c4)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +225,PAR_168,N#CCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)c(Cl)c3)n2)cc1,,6.1039715032828346,,5.27168686708747,,5.457184999498092,,6.155271166086964,,5.257464569418352,,7.11888929424705,<,4.522878745280337,,5.690901081386165,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +226,PAR_400,COc1ccc(CNc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)cc1Cl,<,4.522878745280337,,4.651760068086324,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +227,RSS0680,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)CCCCCN2CCN(c3ccc(Nc4ncc5scc(-c6cccc(NS(C)(=O)=O)c6)c5n4)cc3)CC2)C(C)(C)C)cc1,,6.54261166447481,,5.75585800388447,,6.295388017169884,,5.999097187127363,,5.848763313386655,,6.030727908922796,<,4.522878745280337,,6.611177726392288,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0 +228,ALK_kinase_inhibitor_1,COc1ccc(F)cc1-c1c(CO)sc2cnc(Nc3ccc(N4CCN(C)CC4)cc3OC(C)C)nc12,<,4.522878745280337,<,4.522878745280337,,4.536829647512487,<,4.522878745280337,,4.543123388688776,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +229,PAR_111,NCCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +230,BA_03_60_02,CCCCn1c(=O)cc(C)c2cnc(Nc3cccc(N4CCOCC4)c3)nc21,,5.343366504225447,,4.983140207370476,,5.92190220515168,,5.045371664249361,,4.993732625917914,,6.195057877289041,<,4.522878745280337,,6.116489048104294,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 
+231,HO_N_135_5_A,NC(=O)c1cccnc1Nc1nc(-c2ccccc2)cc2nccn12,,5.167932542347761,,4.786781893903362,<,4.522878745280337,<,4.522878745280337,,4.702448757509358,,4.997133813381141,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +232,ZDG_6_75_2,COc1cc(Nc2ncc3c(n2)c(-c2ccccc2NC(C)=O)nn3C)cc(OC)c1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.8419011094092586,<,4.522878745280337,,4.682799614428752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +233,EF_3_105,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)Cc4ccc(NC(C)=O)cc4)C3)n2)cc(OC)c1OC,,4.574774333558005,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.593842407208746,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +234,ZDG_7_47_2,CN(C)C(=O)c1cccc(-c2cn(C)c3ncnc(N)c23)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +235,PAR_321,Cc1ccc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)cc1O,,6.009589815676634,,4.917865976255178,,4.864457187824321,,5.8264947165744125,,4.973637762948251,,6.774955460551948,<,4.522878745280337,,5.255202138876709,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +236,PAR_318,Cn1nnc2ccc(Nc3nccc(-c4ccc(C(=O)NCC#N)cc4)n3)cc21,,5.11781845030923,<,4.522878745280337,<,4.522878745280337,,5.101684406132499,<,4.522878745280337,,5.795336066086301,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +237,ZDG_7_42_1,Cc1ccccc1CC(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,,5.61231584009862,,5.156010728689806,,6.243550038680226,<,4.522878745280337,<,4.522878745280337,,5.097975280610396,<,4.522878745280337,<,4.522878745280337,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0 
+238,Derazantinib,COCCNCCc1cccc(Nc2ncc3c(n2)-c2ccccc2[C@H](c2ccccc2F)C3)c1,,4.946677510473712,,4.659559714931235,,5.083215299969158,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +239,PAR_345,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C1(c2ccccc2)CCC1,,5.805406688792475,,5.103777576542542,,4.996365992609234,,6.423519472860726,,5.1412836734445975,,7.202756541176565,<,4.522878745280337,,5.409960957047747,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +240,ZDG_6_67,COc1ccc(-c2nn(C)c3cnc(Nc4cc(OC)c(OC)c(OC)c4)nc23)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,6.927259857017201,<,4.522878745280337,,5.156576221429403,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +241,BA_03_50_06,CCS(=O)(=O)c1cccc(Nc2ncnc3ccc(OC)cc23)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +242,PAR_381,CN(CCN1CCCC1)C(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +243,ZDG_6_50_4,COc1cc(Nc2ncc3c(-c4cccc(NS(=O)(=O)CC(F)(F)F)c4)nn(C)c3n2)cc(OC)c1OC,,6.782040430564839,,5.692492267454069,,6.9100886213863655,,6.881171851093462,,5.440548237561658,<,4.522878745280337,<,4.522878745280337,,7.078945066753827,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +244,TL01_010,NCCNS(=O)(=O)c1ccc(-c2ccnc(Nc3ccccc3)n2)cc1,,6.1046622770278045,<,4.522878745280337,,5.094714401665003,,5.359374999407884,,4.807704775595017,,6.860381498185879,<,4.522878745280337,,4.815742834889345,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 
+245,TL01_024,COc1ccc(Nc2nccc(-c3ccc(S(=O)(=O)NCCNS(=O)(=O)C4CC4)cc3)n2)cc1,,5.80560660490504,,5.122882797819735,,4.864661445175623,<,4.522878745280337,<,4.522878745280337,,6.452344651678416,<,4.522878745280337,,4.977133610283185,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +246,HO_N_135_7_A,NC(=O)c1cccnc1Nc1nc(-c2cccc(N3CCOCC3)c2)cc2nccn12,,5.3590028575545645,,4.884641844798198,,4.6950213420499,<,4.522878745280337,,4.880768360848983,,5.207366809608026,<,4.522878745280337,,4.86971742205831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +247,ZDG_6_48_2,CCS(=O)(=O)Nc1cccc(-c2nn(C)c3nc(Nc4cc(OC)c(OC)c(OC)c4)ncc23)c1,,7.080746134041689,,5.695443092049875,,6.770898194393608,,6.995989692893522,,5.648842078920682,<,4.522878745280337,<,4.522878745280337,,7.016264366350998,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +248,PAR_370,O=C(CN1CCOCC1)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.826213105920755,,4.816469259671949,<,4.522878745280337,,5.711795001781335,<,4.522878745280337,,6.008672458852921,<,4.522878745280337,,4.90328024809868,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +249,PAR_241,O=C1CCC(C(=O)Nc2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)C1,,5.088605527511813,,4.628056532634228,<,4.522878745280337,,5.73702692588722,<,4.522878745280337,,6.133812442366555,<,4.522878745280337,,5.299515894450547,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +250,ZDG_7_26_1,CCNC(=O)c1cccc(COc2cc(-n3cnc4cc(OC)c(OC)cc43)sc2C(N)=O)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +251,Tpl2_Kinase_Inhibitor_1,N#Cc1cnc2cnc(NCc3cccnc3)cc2c1Nc1ccc(F)c(Cl)c1,<,4.522878745280337,,5.025389240195332,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+252,BA_03_61_10,Cc1cc(=O)n(C2CCOCC2)c2nc(Nc3ccc(C(=O)NC(C)C)cc3)ncc12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +253,ZDG_7_46_1,Cn1cc(-c2cn(C)c3ncnc(N)c23)cn1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +254,HO_N_135_1_A,NC(=O)c1cccnc1Nc1nc(-c2cccc(C(F)(F)F)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +255,PAR_162,Cc1nocc1C(=O)N[C@H]1CCN(C(=O)Nc2nc3c(s2)-c2nc(-c4ccccc4Br)ncc2CC3)C1,,5.555378610964486,,5.133606152079814,,5.025422728526052,,5.887663815348757,,4.955427327315712,,6.635858913662164,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +256,3842,COc1cc(Nc2nccc(-c3ccc4[nH]ccc4c3)n2)cc(C(F)(F)F)c1,,,,,,,,,,,,5.917933065714887,,,,,,,,,,0.0,, +257,BA_03_50_11,COc1ccc2ncnc(NCCc3ccncc3)c2c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +258,AMG_47a,Cc1ccc(C(=O)Nc2cccc(C(F)(F)F)c2)cc1-c1ccc2nc(NCCN3CCOCC3)ncc2c1,<,4.522878745280337,,4.772798973483567,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +259,Golvatinib,CN1CCN(C2CCN(C(=O)Nc3cc(Oc4ccc(NC(=O)C5(C(=O)Nc6ccc(F)cc6)CC5)c(F)c4)ccn3)CC2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+260,PAR_372,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)c1cccc2c1OCO2,,5.2179944972202765,,5.037654317887743,,4.910870806351179,,5.22120758405103,,5.188357732924196,<,4.522878745280337,<,4.522878745280337,,5.031553392186277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +261,HO_N_132_3_A,COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cc1,,5.046209872089897,,4.917674305168511,,4.779069386451229,<,4.522878745280337,,4.888429126486219,,5.33088176017039,<,4.522878745280337,,5.067141519727515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +262,PAR_113,Cc1cccc(CNC(=O)c2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)c1,,5.217302986321796,,5.110465948069461,,5.028279794192806,,5.292558901588945,,5.149197872069948,,6.055950944435368,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +263,BSc5367,COC(=O)c1cccc(-c2cnc3[nH]cc(-c4cccnc4)c3c2)c1,,5.082177979315596,,4.9655607865278375,,4.73755422206139,,5.032137334346293,<,4.522878745280337,,5.978695546843435,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +264,2096,Fc1cc(-c2ccnc(Nc3ccc(-n4cnc(N5CCOCC5)n4)cc3)n2)cc(N2CCOCC2)c1,,,,,,,,,,,,5.8696662315049934,,,,,,,,,,0.0,, +265,BA_03_51_18,Fc1ccc2ncnc(NCCc3cccc(Br)c3)c2c1,<,4.522878745280337,,6.107209089137724,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0 +266,Casein_Kinase_II_Inhibitor_IV,COc1cc(Nc2ncc3ccn(-c4cccc(CCC#N)c4)c3n2)cc(OC)c1OC,,6.622840508828964,,5.183577810854469,,6.0280015818133,,5.806834079003082,,5.539397807365343,,7.208527440788626,<,4.522878745280337,,6.924195590604335,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0 +267,PAR_316,COc1ccc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)cc1Cl,,5.798704140084891,,5.048846842351343,,4.859488180352635,,5.611174510937102,,5.058627743056515,,6.428120192862645,<,4.522878745280337,,5.214629733947848,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 
+268,BI_882370,CCN1CCC(N(C)c2ccc3c(n2)c(-c2cncnc2)cn3-c2c(F)ccc(NS(=O)(=O)CC)c2F)CC1,,4.69122285715209,,4.890492432107752,,4.769411612209313,<,4.522878745280337,,4.799019010064749,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +269,3732,N#Cc1c(Nc2nc(Nc3ccc(C4CCNCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +270,S116836,Cc1ccc(C(=O)Nc2cc(-n3ccnc3)cc(C(F)(F)F)c2)cc1C#Cc1cnc(NC2CC2)nc1,,4.733622324873981,<,4.522878745280337,,4.890410185201036,,4.930514495386641,,4.970140129981151,<,4.522878745280337,<,4.522878745280337,,4.832507505969726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +271,PAR_351,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccn(C)n3)ncc12,,5.444699172100818,<,4.522878745280337,,5.586962035966202,<,4.522878745280337,,4.763486558006579,,6.379269097867836,<,4.522878745280337,,5.704715854657756,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +272,BA_03_50_19,COc1cccc(CCNc2ncnc3ccc(OC)cc23)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +273,PAR_356,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3cccc(CN4CCOCC4)c3)ncc12,,4.70436486658277,<,4.522878745280337,,5.198768119199553,<,4.522878745280337,<,4.522878745280337,,5.756047241264547,<,4.522878745280337,,4.896988634663503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +274,BA_03_50_07,COc1cccc([C@H](C)Nc2ncnc3ccc(OC)cc23)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +275,HO_N_138_2_A,Cn1cnc2ccc(-c3cc4nccn4c(Nc4ncccc4C(N)=O)n3)cc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+276,HO_N_73,COc1ccc(-c2cc3nccn3c(Nc3ccncc3C(N)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +277,BA_03_65_a,COC(=O)c1sc(-n2cnc3cc(OC)c(OC)cc32)cc1OCc1sccc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +278,ZDG_6_38,COc1cc(Nc2ncc3c(-c4cccc([N+](=O)[O-])c4)nn(C)c3n2)cc(OC)c1OC,,5.352601318996579,<,4.522878745280337,,4.724733398906056,,5.920407115946844,<,4.522878745280337,,7.0345314539649815,<,4.522878745280337,,5.703723860616155,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +279,EF_3_201,O=C(COc1ccccc1)N[C@H]1CCN(c2ccnc(Nc3ccc(F)cc3)n2)C1,<,4.522878745280337,,4.849411098115262,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +280,TL01_023,CC(=O)Nc1ccc(Nc2nccc(-c3ccc(S(=O)(=O)NCCN)cc3)n2)cc1,,4.684514698854729,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.526497375134073,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +281,PAR_333,COc1cc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)ccc1N1CCOCC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +282,PAR_327,O=C(Nc1ccc(-c2ccnc(Nc3ccc(Cl)cc3)n2)cc1)C1CCCN1,<,4.522878745280337,,4.731767974323136,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.051982890328434,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+283,ZDG_6_48_3,COc1cc(Nc2ncc3c(-c4cccc(NC(C)=O)c4)nn(C)c3n2)cc(OC)c1OC,,6.277591921572145,,5.494845765107401,,6.151241698278676,,6.15745762658822,,5.309162923367781,,7.884845915322762,<,4.522878745280337,,6.713127839372918,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 +284,HO_N_71,COc1ccc(-c2cc3nccn3c(Nc3cnccc3C(N)=O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +285,ZDG_7_24_2,COc1cc2ncn(-c3cc(OCc4cc(C(F)(F)F)cc(C(F)(F)F)c4)c(C(N)=O)s3)c2cc1OC,,4.536065565647475,<,4.522878745280337,,4.565454954825468,<,4.522878745280337,,4.580553530573381,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +286,BA_03_59_01,COc1cc(Nc2ncc3c(C)cc(=O)n(C4CCC4)c3n2)cc(OC)c1OC,,5.8095204295214895,,4.793359288759993,,6.554206434466748,,4.788746293579548,,4.903651125964368,,6.693623348897365,<,4.522878745280337,,6.01253545631465,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0 +287,HO_N_135_8_A,NC(=O)c1cccnc1Nc1nc(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)cc2nccn12,<,4.522878745280337,,4.693649170809873,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +288,ZDG_7_15,Cn1cc(-c2cccc(C(F)(F)F)c2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +289,PAR_395,O=C(COc1ccccc1)N[C@H]1CCN(c2ccnc(Nc3cccc(N4CCOCC4)c3)n2)C1,,4.8730652351051305,,4.789935576943914,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+290,ZDG_6_59_3,COc1cc(Nc2ncc3c(n2)c(C#CCN(C)C)nn3C)cc(OC)c1OC,,4.925516752117212,<,4.522878745280337,,5.229719574217726,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.444903582758697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +291,PAR_309,N#CCNC(=O)c1ccc(-c2ccnc(Nc3cccc(N4CCOCC4)c3)n2)cc1,,5.89300432206647,,4.898927360316853,,5.175955362843072,,5.839884473737776,,4.969805758716476,,6.755038622864529,<,4.522878745280337,,5.623268330211212,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +292,BA_03_61_05,CS(=O)(=O)c1cccc(Nc2ncc3ccc(=O)n(C4CCOCC4)c3n2)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +293,PAR_336,CC(C)Oc1ccc(Nc2nccc(-c3ccc(NC(=O)C4CCCN4)cc3)n2)cc1,<,4.522878745280337,,5.064189055843583,<,4.522878745280337,,5.697169228177547,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +294,ZDG_7_40_B,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2cccc3c2OCO3)c2c(N)ncnc21,,5.285138286337057,,4.944511813241703,,5.359068800417957,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +295,PAR_138,O=C(Nc1cc(O)[nH]n1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +296,HO_N_131_5_E,COC(=O)c1cccnc1Nc1nc(-c2cnn(C)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+297,BA_03_56_04,CCOc1ccc2c(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cnn2n1,,6.382563467723347,,5.442808122452164,,6.023964920206267,,6.716551966323366,,5.070433254155037,,7.804781549199357,<,4.522878745280337,,6.817368767842395,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 +298,BA_03_78_c,CS(=O)(=O)c1ccc(CCNc2ncnc3sc(Br)cc23)cc1,<,4.522878745280337,,4.982287803241264,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +299,XL_019,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)[C@@H]1CCCN1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.574508571207707,<,4.522878745280337,,5.118751627945101,<,4.522878745280337,,4.525261171671562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +300,PAR_139,O=C(NC1CN2CCC1CC2)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +301,ZDG_6_50_3,COc1cc(Nc2ncc3c(-c4cccc(NS(=O)(=O)N(C)C)c4)nn(C)c3n2)cc(OC)c1OC,,7.2636414639443965,,5.472371392516424,,6.780426843309521,,6.852293717481508,,5.658476042428143,<,4.522878745280337,<,4.522878745280337,,6.765197331288906,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0 +302,BA_03_51_22,Fc1ccc2ncnc(NCCc3cc(Cl)ccc3Cl)c2c1,<,4.522878745280337,,5.402880060144108,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +303,PAR_271,O=C(NCC12CCC(CC1)C2)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.93508730665225,,4.898964375295649,,4.593051517658745,,5.166284731172521,,4.634132189775471,,5.280807709808106,<,4.522878745280337,,4.9921266697179405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +304,3735,C[C@@H](Oc1cccc(Nc2nc(Nc3ccc(N4CCN(C)CC4)cc3)ncc2Cl)c1C#N)c1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, 
+305,3791,COc1cc(Nc2nccc(N3CC[C@@H](NC(=O)[C@@H]4COc5ccccc5O4)C3)n2)cc(OC)c1OC,,,,,,,,,,,,5.026872146400301,,,,,,,,,,0.0,, +306,ba_03_55_11,FC(F)(F)c1cc(Nc2nccc(-c3cnn4ncccc34)n2)cc(N2CCOCC2)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.880794677881373,<,4.522878745280337,,5.908010613583455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +307,BA_03_56_01,CCOc1ccc2c(-c3ccnc(Nc4cc(OC)c(OC)c(OC)c4)n3)cnn2n1,,5.548684121326926,,4.576730188410739,,4.971611933928847,,5.172837185648649,<,4.522878745280337,,7.13102765360829,<,4.522878745280337,,6.443358002843733,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +308,BA_03_53_08,COc1ccc2c(-c3ccnc(Nc4cccc(S(C)(=O)=O)c4)n3)cnn2n1,,5.3941391822593125,,4.704974298184757,,4.627055792780061,,5.718845206864829,<,4.522878745280337,,6.896405638759742,<,4.522878745280337,,5.735125613105951,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +309,PAR_260,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C1CCCC1,,5.364643116384741,,4.919894520601953,,4.782892139367345,,5.267484537020429,,4.681547148925979,,6.22558409869272,<,4.522878745280337,,5.207024547362414,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +310,HO_N_100,COc1ccc(-c2cc3nccn3c(Nc3ccc(Br)cc3C(N)=O)n2)cc1OC,,4.929800125507245,,4.958520192141293,,4.945548775435376,,5.447834988259078,,4.954234799643147,,5.213831432030042,<,4.522878745280337,,5.359627629229694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +311,PAR_348,O=C(CC1Cc2ccccc2C1)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.490429121138964,,5.24455236714161,,5.062735667799319,,5.232626165902529,,5.253711168111956,,6.1844141633771095,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +312,PAR_183,O=C(N[C@H]1CCN(C(=O)Nc2nc3c(s2)-c2nc(-c4ccccc4Br)ncc2CC3)C1)C1CC1,,4.726863738101162,<,4.522878745280337,<,4.522878745280337,,5.278847629271948,<,4.522878745280337,,5.576243165624568,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+313,HO_N_49,CNC(=O)c1ccccc1Nc1nc(-c2ccc(OC)c(OC)c2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +314,HO_N_110,COc1ccc(-c2cc3nccn3c(Nc3ccc(N4CCOCC4)cc3)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +315,G1T38,CC(C)N1CCN(c2ccc(Nc3ncc4cc5n(c4n3)C3(CCCCC3)CNC5=O)nc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +316,PAR_401,CC(C)Oc1ccc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)cc1Cl,<,4.522878745280337,,5.330784135217551,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +317,ZDG_6_47,COc1cc(Nc2ncc3c(-c4cccc(NC(=O)NC(C)C)c4)nn(C)c3n2)cc(OC)c1OC,,6.2415248779191135,,5.565425742941704,,6.266056824425502,,5.360749393514001,,5.316223306472722,<,4.522878745280337,<,4.522878745280337,,6.191882684062247,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0 +318,ZDG_7_32_B,Cn1cc(-c2ccc3c(c2)CCN3C(=O)C2CC(=O)N(c3cccc(F)c3)C2)c2c(N)ncnc21,,4.574573664014865,,4.639009402674452,,4.939457001176905,<,4.522878745280337,,4.711262806697011,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +319,BA_03_60_07,CCCCn1c(=O)cc(C)c2cnc(Nc3ccccc3OC)nc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+320,BA_03_53_13,COc1cc(F)cc(Nc2nccc(-c3cnn4nc(OC)ccc34)n2)c1,<,4.522878745280337,,4.695633228217571,<,4.522878745280337,,5.267069662565771,<,4.522878745280337,,6.558862363961111,<,4.522878745280337,,6.056005462689938,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0 +321,KW_2449,O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1,,4.948693454498172,<,4.522878745280337,,4.887718432386475,<,4.522878745280337,<,4.522878745280337,,4.914243194598751,<,4.522878745280337,,5.172077927060755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +322,PAR_312,CS(=O)(=O)c1cccc(Nc2nccc(-c3ccc(C(=O)NCC#N)cc3)n2)c1,,5.437692331304064,,4.721068208451692,,4.7616092095892775,,5.408702035088601,,4.76164237316853,,5.988516790479124,<,4.522878745280337,,5.35043781148929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +323,PAR_275,C[C@H](NC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)c1ccccc1,,5.383519364384667,,4.728697888697462,,4.791472677138329,,5.48764661046281,,4.802972377414802,,6.098455015848394,<,4.522878745280337,,4.777662987898371,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +324,PAR_270,CN1CCC(CNC(=O)c2ccc(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +325,ZDG_7_24_3,COc1cc2ncn(-c3cc(OCc4c(F)cccc4F)c(C(N)=O)s3)c2cc1OC,,4.969069819312674,<,4.522878745280337,,5.762940099478096,,5.021398828517733,,4.985809536051859,<,4.522878745280337,<,4.522878745280337,,5.019323835687544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +326,HO_N_136_2_A,NC(=O)c1cccnc1Nc1nc(-c2ccc(N3CCOCC3)cc2)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.545262682940036,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+327,ZDG_7_50_1,CN1CCN(c2ccc(Nc3nccc(-c4ccc(NC(=O)[C@@H]5CCCN5)cc4)n3)cc2Cl)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.543334033192413,<,4.522878745280337,,4.662880551711223,<,4.522878745280337,,4.526598286962623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +328,BA_03_53_05,COc1ccc2c(-c3ccnc(Nc4ccccc4)n3)cnn2n1,,5.180564558632154,<,4.522878745280337,,4.738761595469503,<,4.522878745280337,<,4.522878745280337,,5.847229962194939,<,4.522878745280337,,5.384984275181187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +329,PAR_357,Cn1nccc1Nc1nccc(-c2ccc(C(=O)NCC#N)cc2)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +330,ZDG_6_59_2,COc1cc(Nc2ncc3c(n2)c(C#CC2(O)CCCC2)nn3C)cc(OC)c1OC,,5.685274484736024,<,4.522878745280337,,6.1292204946011815,,5.680715631887232,<,4.522878745280337,,6.716608779237629,<,4.522878745280337,,5.470584302976446,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0 +331,PAR_354,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccn(C4CCN(C)CC4)n3)ncc12,,4.698496042844905,<,4.522878745280337,,5.456566921861255,<,4.522878745280337,<,4.522878745280337,,5.5035703597579975,<,4.522878745280337,,5.358810350415297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +332,ZDG_7_45_2,Cn1cc(-c2cccs2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +333,PAR_362,N#CCNC(=O)c1ccc(-c2ccnc(Nc3cnn(C(F)F)c3)n2)cc1,,5.77941053541049,<,4.522878745280337,<,4.522878745280337,,5.543231659097812,<,4.522878745280337,,6.648579843608567,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +334,3833,CCOc1ccc2c(-c3ccnc(Nc4cc(OC)cc(OC)c4)n3)cnn2n1,,,,,,,,,,,,6.308034897232639,,,,,,,,,,1.0,, 
+335,HO_N_136_7_A,NC(=O)c1cccnc1Nc1nc(-c2ccccc2OC(F)(F)F)cc2nccn12,<,4.522878745280337,,4.803269399789653,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +336,HO_N_133_5_A,COc1ncccc1-c1cc2nccn2c(Nc2ncccc2C(N)=O)n1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +337,ZDG_2_92,COc1cc(Nc2ncc3c(n2)c(-c2cccc(NS(C)(=O)=O)c2)nn3C(C)C)cc(OC)c1OC,,6.536205092379067,,5.244166004673225,,6.082230741504421,,5.004846225536155,,5.209194819890519,,5.905229808621821,<,4.522878745280337,,5.4416650725039055,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0 +338,TBAP_001,CC(C)(C)c1cc(NC(=O)Nc2ccc(Oc3ccnc4[nH]c(=O)cnc34)cc2F)n(-c2cccc(F)c2)n1,,5.101565883784068,,5.034757595688686,,4.9773492337294485,,4.88164810111425,,5.166532930239859,<,4.522878745280337,,5.249676748945265,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +339,ZDG_7_35_A,CC1CC1C(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +340,3838,CCOc1ccc2c(-c3ccnc(Nc4ccc(CN5CCOCC5)cc4)n3)cnn2n1,,,,,,,,,,,,6.052566278112948,,,,,,,,,,1.0,, +341,ALW_II_49_7,Cc1ccc(C(=O)Nc2cccc(C(F)(F)F)c2)cc1Nc1cncc(C(N)=O)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +342,3801,CCOc1ccc2c(-c3ccnc(Nc4cccc(S(C)(=O)=O)c4)n3)cnn2n1,,,,,,,,,,,,7.154901959985742,,,,,,,,,,1.0,, 
+343,HO_N_133_3_E,COC(=O)c1cccnc1Nc1nc(-c2cccc(OC)c2OC)cc2nccn12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +344,BA_03_50_13,COc1ccc2ncnc(N[C@H](C)c3ccccc3F)c2c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +345,BA_03_50_12,COc1ccc2ncnc(NCCc3ccc(F)cc3)c2c1,<,4.522878745280337,,4.979564642347633,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +346,BA_03_61_02,Cc1cc(=O)n(C2CCOCC2)c2nc(Nc3cccc(N4CCOCC4)c3)ncc12,,4.738035237054328,<,4.522878745280337,,4.862452908010375,<,4.522878745280337,,4.736607136839892,,5.305362952137882,<,4.522878745280337,,4.856607360583436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +347,Ifidancitinib,COc1cc(Nc2ncc(C)c(Nc3ccc4oc(=O)[nH]c4c3)n2)cc(C)c1F,,5.387247664524138,,4.832264495121284,,4.5228994754347385,,5.030229217768204,,4.577296369141819,,5.298208185567936,<,4.522878745280337,,5.179591189738451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +348,ZDG_6_66,COc1ccccc1Nc1ncc2c(n1)c(-c1cccc(NS(C)(=O)=O)c1)nn2C,,5.138546774720314,,4.971941526406618,,5.006132125882433,,4.928694690643835,,4.987415237762792,,6.614735179247774,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +349,HO_N_140,COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(=O)O)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+350,HO_N_62,COc1ccc(-c2cc3nccn3c(Nc3ccccn3)n2)cc1OC,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +351,ZDG_7_35_C,Cn1cc(-c2ccc3c(c2)CCN3C(=O)C2C(C)(C)C2(C)C)c2c(N)ncnc21,,4.931814837850233,,5.046661835207546,,5.472401646550287,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +352,GSK461364,C[C@@H](Oc1cc(-n2cnc3ccc(CN4CCN(C)CC4)cc32)sc1C(N)=O)c1ccccc1C(F)(F)F,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +353,3823,CCOc1ccc2c(-c3ccnc(Nc4ccc(C(N)=O)cc4)n3)cnn2n1,,,,,,,,,,,,7.075720713938117,,,,,,,,,,1.0,, +354,PAR_141,O=C(NCC1CCC(F)(F)CC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +355,ZDG_7_26_3,CCN(C(=O)COc1cc(-n2cnc3cc(OC)c(OC)cc32)sc1C(N)=O)c1cccc(C)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +356,BA_03_50_03,COc1ccc2ncnc(NC3[C@H]4C[C@@H]5C[C@@H](C[C@H]3C5)C4)c2c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +357,HO_N_133_3_A,COc1cccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)c1OC,,4.636953072082088,,4.612097017819476,,4.705893907584375,,5.101435565833748,<,4.522878745280337,,5.346187827681289,<,4.522878745280337,,5.08700015944291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+358,PAR_378,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C1CC1,,5.795223621307058,,5.069636181663933,,5.189563149194148,,5.354112429809823,,5.090225183808611,,6.435809867643536,<,4.522878745280337,,5.320880002769998,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +359,PAR_396,CN(C)S(=O)(=O)c1cccc(Nc2nccc(N3CC[C@H](NC(=O)COc4ccccc4)C3)n2)c1,,5.639804514613448,,5.698986663575901,,5.589807383635014,<,4.522878745280337,,5.485285553246834,<,4.522878745280337,<,4.522878745280337,,5.391752647388778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +360,PAR_248,O=C(Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)C1CCOCC1,,4.7701367862789015,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.539222443114369,<,4.522878745280337,,4.828484144000552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +361,ZDG_7_34_B,Cn1cc(-c2ccc3c(c2)CCN3C(=O)C2CCC(F)(F)CC2)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,,4.738800287970558,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +362,PAR_353,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccc(CN4CCOCC4)cc3)ncc12,,4.889178578148,,4.694100978880336,,5.521752502766849,<,4.522878745280337,<,4.522878745280337,,5.609121210067395,<,4.522878745280337,,5.216615473969439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +363,PAR_140,O=C(c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1)N1CC[C@@H](O)C1,,4.947851149814153,<,4.522878745280337,<,4.522878745280337,,4.9167148935059615,<,4.522878745280337,,5.738652702744123,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +364,ZDG_7_41_3,Cn1cc(-c2ccc3c(c2)CCN3C(=O)Cc2ccc3c(c2)OCCO3)c2c(N)ncnc21,,5.009846998035169,,5.068850956510059,,5.460293990716453,<,4.522878745280337,,4.979278393101506,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+365,ON1231320,Cn1c(=O)c(S(=O)(=O)c2ccc(F)cc2F)cc2cnc(Nc3ccc4[nH]ccc4c3)nc21,,5.414618604428587,,5.277990016375206,,5.399388668716516,,5.37920890878158,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.257878075412741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +366,BA_03_61_03,Cc1cc(=O)n(C2CCOCC2)c2nc(Nc3ccc(N4CCOCC4)cc3)ncc12,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +367,PAR_81,O=C(Nc1nc2c(s1)-c1nc(-c3ccccc3Br)ncc1CC2)NC1CCCNC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +368,ZDG_7_51_7,COc1cc(Nc2ncc3c(C)cc(=O)n(C4CCCC4)c3n2)cc(OC)c1OC,,5.844613985129847,<,4.522878745280337,,6.002273976255355,,4.53953279402968,,5.039361792838563,,6.680243093632786,<,4.522878745280337,,6.224836377308716,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0 +369,ZDG_6_72,COc1cc(Nc2ncc3c(n2)c(C2=CCOCC2)nn3C)cc(OC)c1OC,,5.256698859829601,<,4.522878745280337,,5.408457936833549,,5.050230571632087,<,4.522878745280337,,7.322740251210563,<,4.522878745280337,,5.217736958089621,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +370,ZDG_7_53_1,CN1CCN(c2cccc(Nc3nccc(-c4ccc(NC(=O)[C@@H]5CCCN5)cc4)n3)c2)CC1,,5.338173223355024,<,4.522878745280337,<,4.522878745280337,,6.231164950149952,<,4.522878745280337,,6.334276757369631,<,4.522878745280337,,5.32844855319665,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0 +371,Nimucitinib,NC(=O)c1cnc(Nc2cccc(NC(=O)[C@H]3CCCNC3)c2)cc1NCc1cc(F)cc(F)c1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.651869225303184,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+372,HO_N_83,CN1CCN(c2ccc(Nc3ncc4c(n3)c(-c3cccc(NS(C)(=O)=O)c3)nn4C)cc2)CC1,,5.561448562355972,<,4.522878745280337,,6.155927167866227,,6.479296566675651,<,4.522878745280337,,7.354332708971508,<,4.522878745280337,,6.592940999627449,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 +373,PAR_110,O=C(NC1CCCNC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +374,ZDG_6_41,COc1cc(Nc2ncc3c(-c4cccc(N)c4)nn(C)c3n2)cc(OC)c1OC,,6.072137063704354,,5.331158920986598,,6.41637171103705,,6.192495604580524,,5.327101828744807,,7.769368396562845,<,4.522878745280337,,6.210385953351892,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0 +375,ZDG_7_47_3,Cn1cc(-c2ccc3c(c2)OCCO3)c2c(N)ncnc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +376,3856,Cc1cnc(Nc2ccc(NC(=O)c3cccc(NC(=O)/C=C/CN(C)C)c3)cc2)nc1-c1cccnc1,,,,,,,,,,,,5.549750891680638,,,,,,,,,,0.0,, +377,BA_03_53_04,COc1ccc2c(-c3ccnc(Nc4ccc(N5CCOCC5)cc4)n3)cnn2n1,,5.824924592355906,,5.209294342423722,,5.663139266208908,,6.3290190292376,,4.837143150361911,<,4.522878745280337,<,4.522878745280337,,6.6043667019657,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0 +378,ZDG_7_9,Cc1cc(=O)n(C2CCCC2)c2nc(Nc3ccc(C(=O)NCC(F)F)cc3)ncc12,,4.956505457771383,,4.703039486786711,,4.990639364692515,<,4.522878745280337,,4.74687777691037,,6.067682583262696,<,4.522878745280337,,5.269688804153579,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0 +379,ZDG_7_37_C,Cn1cc(-c2ccc3c(c2)CCN3C(=O)CC2Cc3ccccc3C2)c2c(N)ncnc21,,5.105383633393123,,5.257293748542373,,5.378444490078154,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+380,ZDG_7_50_5,CN1CCN(C(=O)c2ccc(Nc3nccc(-c4ccc(NC(=O)[C@@H]5CCCN5)cc4)n3)cc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.154876214572618,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +381,ZDG_7_50_3,CN1CCN(c2ccc(Nc3nccc(-c4ccc(NC(=O)[C@@H]5CCCN5)cc4)n3)cc2)CC1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,5.493317766712035,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,4.58678479491856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +382,PAR_108,O=C(NC1CCCCCC1)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,4.648976885597432,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +383,TIE_2_VEGFR_2_kinase_IN_2,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4cc(C(F)(F)F)ccc4F)cc3)c12,,4.923856956015497,,5.279845965163705,,5.358873766581655,<,4.522878745280337,<,4.522878745280337,,5.275306784282457,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +384,BA_03_65_b,COC(=O)c1sc(-n2cnc3cc(OC)c(OC)cc32)cc1OCc1ccc(-c2ccncc2)s1,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +385,ZDG_7_36_C,CC#CC(=O)N1CCc2cc(-c3cn(C)c4ncnc(N)c34)ccc21,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +386,3739,N#Cc1c(Nc2nc(Nc3ccc(N4CCNCC4)cc3)ncc2Cl)cccc1OCc1cccc(F)c1,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +387,ZDG_6_48_1,COc1cc(Nc2ncc3c(-c4cccc(NS(=O)(=O)C5CC5)c4)nn(C)c3n2)cc(OC)c1OC,,7.134526430898649,<,4.522878745280337,,6.827566657475918,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,,6.980961650786502,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0 
+388,3743,N#Cc1c(Nc2nc(Nc3cnn(C4CCNCC4)c3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +389,ZDG_7_23_2,COc1cc2ncn(-c3cc(OCc4ccc(F)cc4)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,<,4.522878745280337,,4.595719696676417,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +390,PAR_408,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)c4ccc(C#N)cc4)C3)n2)cc(OC)c1OC,,5.598642092872625,,4.692209917210851,,4.864752009196747,<,4.522878745280337,,4.769202671483697,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +391,3821,CCOc1ccc2c(-c3ccnc(Nc4ccc(N5CCN(C)CC5)cc4)n3)cnn2n1,,,,,,,,,,,,7.236572006437062,,,,,,,,,,1.0,, +392,BA_03_66_c,COc1cc2ncn(-c3cc(OCc4sccc4F)c(C(N)=O)s3)c2cc1OC,<,4.522878745280337,,4.534933246450109,,4.987977287080152,<,4.522878745280337,,4.6026568572406905,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +393,PAR_149,O=C(Nc1nc2c(s1)-c1nc(-c3ccccc3Br)ncc1CC2)N1CCC(N2CCCCC2)CC1,,4.803895504413972,<,4.522878745280337,,4.70292748711545,,4.937318844782486,<,4.522878745280337,,5.305326640282966,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +394,PAR_247,CCOc1ccsc1C(=O)Nc1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,,5.213153750968695,,5.065611488315886,,4.91383756830168,,5.220674896840294,,5.14240597996013,,5.22581934838519,<,4.522878745280337,,5.141058054333915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +395,CHZ868,CC(=O)Nc1cc(Oc2ccc3c(nc(Nc4ccc(F)cc4F)n3C)c2C)ccn1,<,4.522878745280337,,4.73755906290163,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 
+396,HS_1371,Cc1ccc(Oc2ccnc3cc(-c4cnn(C5CCNCC5)c4)ccc23)cc1,<,4.522878745280337,,4.546058875121365,,4.6313362303328764,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,<,4.522878745280337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +397,3722,N#Cc1c(Nc2nc(Nc3ccc(N4CCOCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +398,3822,CCOc1ccc2c(-c3ccnc(Nc4cccc(S(N)(=O)=O)c4)n3)cnn2n1,,,,,,,,,,,,7.119186407719209,,,,,,,,,,1.0,, +399,3738,N#Cc1c(Nc2nc(Nc3ccc(N4CCNCC4)cc3)ncc2Cl)cccc1OCc1ccc(F)cc1,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +400,3816,CCOc1ccc2c(-c3ccnc(Nc4cccc(NC(C)=O)c4)n3)cnn2n1,,,,,,,,,,,,7.795880017344074,,,,,,,,,,1.0,, +401,3740,CN1CCN(Cc2ccc(Nc3ncc(Cl)c(Nc4ccc(C#N)c(OCc5ccccc5F)c4)n3)cc2)CC1,,,,,,6.578396073130168,,,,,,,,,,,,,1.0,,,,, +402,3741,N#Cc1c(Nc2nc(Nc3ccc(CN4CCOCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +403,3789,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)[C@@H]4COc5ccccc5O4)C3)n2)cc(OC)c1OC,,,,,,,,,,,,5.920818753952374,,,,,,,,,,0.0,, +404,3814,CCOc1ccc2c(-c3ccnc(Nc4ccc(S(C)(=O)=O)cc4)n3)cnn2n1,,,,,,,,,,,,7.853871964321761,,,,,,,,,,1.0,, +405,3825,CCOc1ccc2c(-c3ccnc(Nc4cccc(C(=O)N5CCN(C)CC5)c4)n3)cnn2n1,,,,,,,,,,,,6.815308569182401,,,,,,,,,,1.0,, +406,3792,COc1cc(Nc2nccc(N3CC[C@@H](NC(=O)[C@H]4COc5ccccc5O4)C3)n2)cc(OC)c1OC,,,,,,,,,,,,5.337242168318426,,,,,,,,,,0.0,, +407,3837,Cc1cc(Nc2nccc(-c3cnn4ncccc34)n2)ccc1N1CCN(C)CC1,,,,,,,,,,,,6.060480747381381,,,,,,,,,,1.0,, +408,3731,COc1cc(Nc2ncc(Cl)c(Nc3cccc(OCc4ccccc4F)c3C#N)n2)ccc1N1CCNCC1,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +409,3818,CCOc1ccc2c(-c3ccnc(Nc4ccc(Cl)c(Cl)c4)n3)cnn2n1,,,,,,,,,,,,7.568636235841011,,,,,,,,,,1.0,, +410,3728,N#Cc1c(Nc2nc(Nc3ccc(N4CCN(CCO)CC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +411,3725,N#Cc1c(Nc2nc(Nc3ccc(N4CCNCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, 
+412,3736,N#Cc1c(Nc2nc(Nc3ccc(C(=O)N4CCNCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +413,3834,CN1CCN(c2cccc(Nc3nccc(-c4cnn5ncccc45)n3)c2)CC1,,,,,,,,,,,,6.300162274132754,,,,,,,,,,1.0,, +414,3730,CN1CCN(c2ccc(Nc3ncc(Cl)c(Nc4cccc(OCc5ccccc5F)c4C#N)n3)cc2)CC1,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +415,2092,Cc1cc(Nc2nccc(-c3cccnc3)n2)ccc1NC(=O)c1cccc(NC(=O)/C=C/CN(C)C)c1,,,,,,,,,,,,6.14266750356873,,,,,,,,,,1.0,, +416,3826,COc1ccc2c(-c3ccnc(Nc4ccc(N5CCN(C)CC5)cc4)n3)cnn2n1,,,,,,,,,,,,6.7594507517174,,,,,,,,,,1.0,, +417,3727,COc1cnc(Nc2ccc(N3CCNCC3)cc2)nc1Nc1cccc(OCc2ccccc2F)c1C#N,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +418,3744,N#Cc1c(Nc2nc(Nc3ccc(N4CCNCC4)nc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +419,3726,CN1CCN(c2ccc(Nc3ncc(Cl)c(Nc4cccc(OCc5c(F)cccc5F)c4C#N)n3)cc2)CC1,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +420,3835,CCOc1ccc2c(-c3ccnc(Nc4cccc(S(=O)(=O)C(F)(F)F)c4)n3)cnn2n1,,,,,,,,,,,,6.291579099865286,,,,,,,,,,1.0,, +421,3824,CCOc1ccc2c(-c3ccnc(Nc4cccc(C(N)=O)c4)n3)cnn2n1,,,,,,,,,,,,6.863279432843592,,,,,,,,,,1.0,, +422,3745,N#Cc1c(Nc2nc(Nc3ccc(CN4CCNCC4)cc3)ncc2Cl)cccc1OCc1ccccc1F,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +423,2094,CN(C)CCCC(=O)Nc1cccc(C(=O)Nc2ccc(Nc3nccc(-c4cccnc4)n3)cc2)c1,,,,,,,,,,,,5.616184634019568,,,,,,,,,,0.0,, +424,3793,COc1cc(Nc2nccc(N3CC[C@H](NC(=O)[C@H]4COc5ccccc5O4)C3)n2)cc(OC)c1OC,,,,,,,,,,,,5.070581074285707,,,,,,,,,,0.0,, +425,3729,COc1cc(N2CCNCC2)ccc1Nc1ncc(Cl)c(Nc2cccc(OCc3ccccc3F)c2C#N)n1,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, +426,3724,C[C@H](Oc1cccc(Nc2nc(Nc3ccc(N4CCN(C)CC4)cc3)ncc2Cl)c1C#N)c1ccccc1F,,,,,,7.30102999566398,,,,,,,,,,,,,1.0,,,,, +427,3828,COc1ccc2c(-c3ccnc(Nc4ccc(N5CCN(C)CC5)c(F)c4)n3)cnn2n1,,,,,,,,,,,,6.707743928643524,,,,,,,,,,1.0,, +428,3737,COc1ccccc1COc1cccc(Nc2nc(Nc3ccc(N4CCNCC4)cc3)ncc2Cl)c1C#N,,,,,,6.823908740944318,,,,,,,,,,,,,1.0,,,,, 
+429,2099,Cc1ccc(-c2ccnc(Nc3ccc(NC(=O)c4cccc(NC(=O)/C=C/CN(C)C)c4)cc3)n2)cn1,,,,,,,,,,,,6.070581074285706,,,,,,,,,,1.0,, diff --git a/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data_train_valid_test_multitaskscaffold_e34ba827-a532-4313-9e63-8a9b0ed18ba9.csv b/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data_train_valid_test_multitaskscaffold_e34ba827-a532-4313-9e63-8a9b0ed18ba9.csv new file mode 100755 index 00000000..9699217d --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/nanobret_multitask_classification_data_train_valid_test_multitaskscaffold_e34ba827-a532-4313-9e63-8a9b0ed18ba9.csv @@ -0,0 +1,431 @@ +cmpd_id,subset,fold +PAR_272,train,0 +PAR_315,train,0 +ZDG_7_52_4,train,0 +HO_N_101,train,0 +HO_N_57_1,train,0 +BA_03_61_01,train,0 +PAR_182,train,0 +BA_03_69_c,train,0 +PAR_334,train,0 +ZDG_7_50_2,train,0 +3746,train,0 +ATH686,train,0 +TL01_022,train,0 +BA_03_53_12,train,0 +PAR_379,train,0 +PAR_331,train,0 +PAR_403,train,0 +PAR_380,train,0 +HO_N_90,train,0 +PAR_142,train,0 +BA_03_56_12,train,0 +TL01_026,train,0 +GW843682,train,0 +BA_03_60_04,train,0 +HO_N_135_2_A,train,0 +AST487,train,0 +PAR_310,train,0 +PAR_268,train,0 +HO_N_133_2_A,train,0 +TL01_019,train,0 +PAR_291,train,0 +ZDG_7_40_C,train,0 +HO_N_136_6_A,train,0 +BA_03_59_02,train,0 +ZDG_7_44_1,train,0 +PAR_294,train,0 +BBT594,train,0 +PAR_298,train,0 +PAR_269,train,0 +PAR_404,train,0 +HO_N_96,train,0 +PAR_261,train,0 +HO_N_42,train,0 +BA_03_50_15,train,0 +PAR_158,train,0 +PAR_402,train,0 +3820,train,0 +BA_03_50_18,train,0 +PAR_112,train,0 +BA_03_56_11,train,0 +BA_03_61_07,train,0 +PAR_314,train,0 +PAR_157,train,0 +PAR_109,train,0 +PAR_159,train,0 +HO_N_67,train,0 +HO_N_57_2,train,0 +PAR_320,train,0 +PAR_377,train,0 +PAR_383,train,0 +Bafetinib,train,0 +HO_N_131_5_A,train,0 +PAR_371,train,0 +PAR_369,train,0 +ZDG_6_49_1,train,0 +ZDG_7_33_C,train,0 +BA_03_51_15,train,0 +BA_03_66_a,train,0 +BA_03_53_11,train,0 +ZDG_7_25_3,train,0 
+HO_N_116,train,0 +Brepocitinib,train,0 +HO_N_115,train,0 +BA_03_50_20,train,0 +BA_03_69_a,train,0 +HO_N_137_A,train,0 +PAR_249,train,0 +PAR_361,train,0 +PAR_274,train,0 +ZDG_5_55_6,train,0 +BA_03_50_21,train,0 +HO_N_95,train,0 +EF_3_101,train,0 +ZDG_6_50_1,train,0 +ZDG_7_52_6,train,0 +HO_N_104,train,0 +HO_N_99,train,0 +PAR_277,train,0 +BA_03_50_01,train,0 +BA_03_55_12,train,0 +ZDG_7_23_1,train,0 +EF_3_103,train,0 +PAR_299,train,0 +BA_03_50_22,train,0 +ZDG_6_59_1,train,0 +ZDG_7_35_B,train,0 +BA_03_59_04,train,0 +PAR_322,train,0 +PAR_342,train,0 +HO_N_139_2,train,0 +HO_N_129,train,0 +ZDG_7_41_A,train,0 +HO_N_135_2_E,train,0 +PAR_374,train,0 +BA_03_59_10,train,0 +PAR_311,train,0 +ZDG_5_55_7,train,0 +ZDG_7_37_B,train,0 +ZDG_7_11,train,0 +ZDG_7_43_2,train,0 +PAR_205,train,0 +ZDG_7_27_1,train,0 +CE_245677,train,0 +ZDG_7_14,train,0 +HO_N_136_2_E,train,0 +BA_03_59_03,train,0 +PAR_394,train,0 +BA_03_51_19,train,0 +PAR_398,train,0 +ZDG_7_21_2,train,0 +HO_N_98,train,0 +PAR_90,train,0 +BA_03_59_05,train,0 +PAR_338,train,0 +ZDG_7_44_4,train,0 +BA_03_59_07,train,0 +ZDG_6_48_4,train,0 +BA_03_65_c,train,0 +BA_03_61_04,train,0 +ZDG_6_60,train,0 +HO_N_133_1_A,train,0 +PAR_360,train,0 +3723,train,0 +BA_03_80_A,train,0 +BA_03_60_03,train,0 +BA_03_66_h,train,0 +ZDG_7_24_1,train,0 +PAR_168,train,0 +PAR_111,train,0 +BA_03_60_02,train,0 +EF_3_105,train,0 +PAR_321,train,0 +PAR_318,train,0 +Derazantinib,train,0 +PAR_345,train,0 +PAR_381,train,0 +ZDG_6_50_4,train,0 +TL01_010,train,0 +TL01_024,train,0 +HO_N_135_7_A,train,0 +ZDG_6_48_2,train,0 +PAR_370,train,0 +PAR_241,train,0 +ZDG_7_26_1,train,0 +Tpl2_Kinase_Inhibitor_1,train,0 +BA_03_61_10,train,0 +ZDG_7_46_1,train,0 +PAR_162,train,0 +3842,train,0 +BA_03_50_11,train,0 +AMG_47a,train,0 +Golvatinib,train,0 +PAR_372,train,0 +PAR_113,train,0 +BA_03_51_18,train,0 +PAR_316,train,0 +3732,train,0 +S116836,train,0 +BA_03_50_19,train,0 +PAR_356,train,0 +BA_03_50_07,train,0 +HO_N_138_2_A,train,0 +HO_N_73,train,0 +BA_03_65_a,train,0 +ZDG_6_38,train,0 
+TL01_023,train,0 +PAR_333,train,0 +ZDG_6_48_3,train,0 +HO_N_71,train,0 +ZDG_7_24_2,train,0 +BA_03_59_01,train,0 +PAR_395,train,0 +ZDG_6_59_3,train,0 +PAR_309,train,0 +BA_03_61_05,train,0 +ZDG_7_40_B,train,0 +PAR_138,train,0 +HO_N_131_5_E,train,0 +BA_03_56_04,train,0 +XL_019,train,0 +PAR_139,train,0 +ZDG_6_50_3,train,0 +BA_03_51_22,train,0 +PAR_271,train,0 +3735,train,0 +ba_03_55_11,train,0 +PAR_260,train,0 +HO_N_100,train,0 +PAR_348,train,0 +PAR_183,train,0 +HO_N_49,train,0 +HO_N_110,train,0 +G1T38,train,0 +ZDG_6_47,train,0 +ZDG_7_32_B,train,0 +PAR_312,train,0 +PAR_275,train,0 +ZDG_7_24_3,train,0 +HO_N_136_2_A,train,0 +ZDG_7_50_1,train,0 +ZDG_7_35_A,train,0 +3838,train,0 +ALW_II_49_7,train,0 +BA_03_50_13,train,0 +BA_03_50_12,train,0 +BA_03_61_02,train,0 +Ifidancitinib,train,0 +ZDG_7_35_C,train,0 +GSK461364,train,0 +PAR_141,train,0 +ZDG_7_26_3,train,0 +BA_03_50_03,train,0 +PAR_378,train,0 +PAR_248,train,0 +ZDG_7_34_B,train,0 +PAR_353,train,0 +PAR_140,train,0 +ZDG_7_51_7,train,0 +ZDG_6_72,train,0 +ZDG_7_53_1,train,0 +Nimucitinib,train,0 +HO_N_83,train,0 +PAR_110,train,0 +ZDG_6_41,train,0 +ZDG_7_47_3,train,0 +BA_03_53_04,train,0 +ZDG_7_9,train,0 +ZDG_7_37_C,train,0 +ZDG_7_50_5,train,0 +ZDG_7_50_3,train,0 +PAR_108,train,0 +TIE_2_VEGFR_2_kinase_IN_2,train,0 +ZDG_7_36_C,train,0 +3739,train,0 +ZDG_6_48_1,train,0 +ZDG_7_23_2,train,0 +PAR_408,train,0 +3821,train,0 +BA_03_66_c,train,0 +HS_1371,train,0 +3722,train,0 +3738,train,0 +3740,train,0 +3741,train,0 +3825,train,0 +3837,train,0 +3731,train,0 +3728,train,0 +3725,train,0 +3736,train,0 +3834,train,0 +3730,train,0 +3826,train,0 +3727,train,0 +3744,train,0 +3726,train,0 +3745,train,0 +3729,train,0 +3724,train,0 +3828,train,0 +3737,train,0 +PAR_337,valid,0 +Narazaciclib,valid,0 +PAR_363,valid,0 +PAR_335,valid,0 +HO_N_135_4_A,valid,0 +3827,valid,0 +Altiratinib,valid,0 +PAR_252,valid,0 +ZDG_7_43_3,valid,0 +PAR_376,valid,0 +PAR_375,valid,0 +BA_03_53_01,valid,0 +EF_3_203,valid,0 +ZDG_7_51_5,valid,0 +TL01_020,valid,0 
+GSK2606414,valid,0 +Tovorafenib,valid,0 +PAR_323,valid,0 +HO_N_105,valid,0 +GCN2_IN_1,valid,0 +ZDG_7_39_A,valid,0 +PAR_406,valid,0 +BA_03_55_01,valid,0 +Axitinib,valid,0 +3829,valid,0 +ZDG_7_31_A,valid,0 +HO_N_133_4_A,valid,0 +2093,valid,0 +PAR_405,valid,0 +HO_N_136_5_A,valid,0 +Encorafenib,valid,0 +BA_03_56_14,valid,0 +3819,valid,0 +Rac_CCT_250863,valid,0 +PAR_330,valid,0 +PAR_382,valid,0 +ZDG_7_38_A,valid,0 +BA_03_55_14,valid,0 +HO_N_136_4_A,valid,0 +HO_N_135_4_E,valid,0 +BA_03_50_08,valid,0 +BA_03_53_14,valid,0 +HO_N_134_E,valid,0 +PAR_228,valid,0 +ZDG_6_48_7,valid,0 +HO_N_136_3_A,valid,0 +BA_03_53_06,valid,0 +HO_N_134_A,valid,0 +ZDG_7_39_C,valid,0 +PAR_324,valid,0 +HO_N_132_3_E,valid,0 +3853,valid,0 +3790,valid,0 +TC_S_7005,valid,0 +PAR_400,valid,0 +RSS0680,valid,0 +ALK_kinase_inhibitor_1,valid,0 +HO_N_135_5_A,valid,0 +ZDG_7_42_1,valid,0 +HO_N_135_1_A,valid,0 +HO_N_132_3_A,valid,0 +EF_3_201,valid,0 +PAR_327,valid,0 +HO_N_135_8_A,valid,0 +PAR_336,valid,0 +3791,valid,0 +BA_03_56_01,valid,0 +BA_03_53_08,valid,0 +PAR_401,valid,0 +BA_03_53_13,valid,0 +KW_2449,valid,0 +PAR_270,valid,0 +BA_03_53_05,valid,0 +PAR_357,valid,0 +ZDG_6_59_2,valid,0 +3833,valid,0 +HO_N_136_7_A,valid,0 +3801,valid,0 +HO_N_133_3_E,valid,0 +HO_N_140,valid,0 +HO_N_62,valid,0 +3823,valid,0 +HO_N_133_3_A,valid,0 +PAR_396,valid,0 +ZDG_7_41_3,valid,0 +ON1231320,valid,0 +BA_03_61_03,valid,0 +PAR_81,valid,0 +3856,valid,0 +3743,valid,0 +PAR_149,valid,0 +PAR_247,valid,0 +CHZ868,valid,0 +3822,valid,0 +3816,valid,0 +3789,valid,0 +3814,valid,0 +3792,valid,0 +3818,valid,0 +2092,valid,0 +3835,valid,0 +3824,valid,0 +2094,valid,0 +3793,valid,0 +2099,valid,0 +BA_03_50_04,test,0 +GSK329,test,0 +PAR_225,test,0 +ZDG_7_46_3,test,0 +HO_N_138_3_A,test,0 +ZDG_6_64,test,0 +BA_03_78_d,test,0 +PAR_355,test,0 +ZDG_6_75_4,test,0 +BA_03_50_05,test,0 +ZDG_2_91,test,0 +ZDG_6_51_2,test,0 +ZDG_7_48_1,test,0 +ZDG_7_44_3,test,0 +ZDG_7_45_1,test,0 +ZDG_7_47_1,test,0 +Culmerciclib,test,0 +3_IN_PP1,test,0 +GCN2iB,test,0 
+Tivozanib_hydrochloride_hydrate,test,0 +PAR_89,test,0 +PAR_244,test,0 +MAX_40279,test,0 +ZDG_7_46_4,test,0 +ZDG_7_43_1,test,0 +ZDG_6_61_N2,test,0 +PAR_358,test,0 +BA_03_66_b,test,0 +ZDG_2_93,test,0 +BA_03_60_05,test,0 +ZDG_6_75_3,test,0 +BA_03_60_01,test,0 +ZDG_7_43_4,test,0 +BA_03_60_10,test,0 +ZDG_6_75_2,test,0 +ZDG_7_47_2,test,0 +ZDG_6_67,test,0 +BA_03_50_06,test,0 +BSc5367,test,0 +2096,test,0 +Casein_Kinase_II_Inhibitor_IV,test,0 +BI_882370,test,0 +PAR_351,test,0 +ZDG_7_15,test,0 +BA_03_78_c,test,0 +BA_03_60_07,test,0 +PAR_354,test,0 +ZDG_7_45_2,test,0 +PAR_362,test,0 +HO_N_133_5_A,test,0 +ZDG_2_92,test,0 +TBAP_001,test,0 +ZDG_6_66,test,0 +BA_03_65_b,test,0 diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling_mtss_model.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling_mtss_model.py new file mode 100755 index 00000000..d60c4551 --- /dev/null +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling_mtss_model.py @@ -0,0 +1,77 @@ +# smote/undersampling multitask test + +import os +import atomsci.ddm.pipeline.model_pipeline as mp +import atomsci.ddm.pipeline.parameter_parser as parse +import pytest + + +def test_mtss_model(): + script_path = os.path.dirname(os.path.realpath(__file__)) + dataset_file = os.path.join(script_path, "nanobret_multitask_classification_data.csv") + split_uuid="e34ba827-a532-4313-9e63-8a9b0ed18ba9" + odir = os.path.join(script_path, "output") + + id_col="compound_id" + smiles_col="base_rdkit_smiles" + response_cols="NEK1_active,NEK2_active,NEK3_active,NEK5_active,NEK9_active" + + params = { + # logistics input + "dataset_key": dataset_file, + "smiles_col": smiles_col, + "prediction_type": "classification", + "split_uuid": split_uuid, + "splitter": "multitaskscaffold", + "response_cols": response_cols, + "previously_split": "True", + + # dataset + "id_col": id_col, + "result_dir": odir, + + # featurization and model + "featurizer": "computed_descriptors", + "descriptor_type": "rdkit_raw", + "model_type": 
"NN", + # grid search + "max_epochs": "300", + "early_stopping_patience": "100", + "sampling_method":"SMOTE", + "layer_sizes": "128,128,128", + "dropouts": "0.1,0.1,0.10", + "learning_rates": "0.0007", + + # extras, can be deleted as needed + "system": "LC", + "verbose": "True", + } + ampl_param = parse.wrapper(params) + pl = mp.ModelPipeline(ampl_param) + with pytest.raises(ValueError) as e: + # this should say + # Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported. + pl.train_model() + print("done") + +def test_imblearn_mtss_compatibility(): + # this just shows that all SMOTE methods do not work with multitask problems. + import sklearn.datasets as skdatasets + import numpy as np + from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE + + X, y = skdatasets.make_classification() + print(X.shape, y.shape) + multi_y = np.vstack([y, y, y]).transpose() + print(multi_y.shape) + + for sampler in [SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE]: + sm = sampler() + try: + _x, _y = sm.fit_resample(X, multi_y) + except Exception as e: + print(sm) + print(e) + +if __name__ == "__main__": + test_mtss_model() From 2e03fef8528f8c2add9d651a242118b791d04153 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 25 Sep 2024 11:29:43 -0700 Subject: [PATCH 33/57] Used parameter to determine if SMOTE or undersampling is being used --- atomsci/ddm/pipeline/model_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index 7ad2fee9..d5bd3a4a 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -661,17 +661,17 @@ def combined_training_data(self): # All of the splits have the same combined train/valid data, regardless of whether we're using # k-fold or train/valid/test splitting. 
if self.combined_train_valid_data is None: - # normally combining one fold is sufficient, but if SMOTE is being used - # each fold will have compounds unique to it. + # normally combining one fold is sufficient, but if SMOTE or undersampling is being used + # just combining the first fold isn't enough (train, valid) = self.train_valid_dsets[0] combined_X = np.concatenate((train.X, valid.X), axis=0) combined_y = np.concatenate((train.y, valid.y), axis=0) combined_w = np.concatenate((train.w, valid.w), axis=0) combined_ids = np.concatenate((train.ids, valid.ids)) - contains_synthetic = any(id.startswith('synthetic_') for id in train.ids) - if contains_synthetic: + if self.params.sampling_method=='SMOTE' or self.params.sampling_method=='undersampling': # for each successive fold, merge in any new compounds + # this loop just won't run if there are no additional folds for train, valid in self.train_valid_dsets[1:]: fold_ids = np.concatenate((train.ids, valid.ids)) new_id_indexes = [i for i in range(len(fold_ids)) if i not in combined_ids] From b48ed02a69a655e570b90cccfad6939934563635 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 25 Sep 2024 11:36:44 -0700 Subject: [PATCH 34/57] Added a seed to this test for more consistent results --- .../jsons/reg_config_delaney_fit_NN_graphconv.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/delaney_Panel/jsons/reg_config_delaney_fit_NN_graphconv.json b/atomsci/ddm/test/integrative/delaney_Panel/jsons/reg_config_delaney_fit_NN_graphconv.json index dd5d7fdf..aa8461da 100644 --- a/atomsci/ddm/test/integrative/delaney_Panel/jsons/reg_config_delaney_fit_NN_graphconv.json +++ b/atomsci/ddm/test/integrative/delaney_Panel/jsons/reg_config_delaney_fit_NN_graphconv.json @@ -40,6 +40,7 @@ "comment": "Test", "comment": "----------------------------------------", - "perf_threshold": "0.50" + "perf_threshold": "0.50", + "seed":0 } From 567264ac0dc0bd1f1bb440717f45847a1729488b Mon 
Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 26 Sep 2024 10:15:08 -0700 Subject: [PATCH 35/57] Changed balancing transformer to just check to see if the weights changed --- .../test_balancing_transformer.py | 129 ++++-------------- 1 file changed, 29 insertions(+), 100 deletions(-) diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 2508b0ea..c249d6a8 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -1,88 +1,48 @@ -import pandas as pd import tempfile import atomsci.ddm.pipeline.parameter_parser as parse import atomsci.ddm.pipeline.model_pipeline as mp +import numpy as np import logging logger = logging.getLogger(__name__) -nreps = 10 -metrics = [] -vals = [] -balanced = [] -subset = [] - def test_balancing_transformer(): dset_key = '../../test_datasets/MRP3_dataset.csv' - dset_df = pd.read_csv(dset_key) res_dir = tempfile.mkdtemp() - split_uuid = create_scaffold_split(dset_key, res_dir) - - # train the model without the balancing - train_model_wo_balan(dset_key, split_uuid, res_dir) - # train the model with the balancing parameter - train_model_w_balan(dset_key, split_uuid, res_dir) - - metrics_df = pd.DataFrame(dict(subset=subset, balanced=balanced, metric=metrics, val=vals)) - - # check the recall_score - rec_df = metrics_df[metrics_df.metric == 'recall_score'] - not_balanced_series = rec_df[(rec_df.balanced == 'no')].groupby("subset").val.mean() - balanced_series = rec_df[(rec_df.balanced == 'yes')].groupby("subset").val.mean() + balanced_params = params_w_balan(dset_key, res_dir) + balanced_weights = make_pipeline_and_get_weights(balanced_params) + (major_weight, minor_weight), (major_count, minor_count) = np.unique(balanced_weights, return_counts=True) + assert major_weight < minor_weight - 
assert((balanced_series['test'] > not_balanced_series['test']) & (balanced_series['valid'] > not_balanced_series['valid']) ) + assert major_count == 416 + assert minor_count == 20 -def create_scaffold_split(dset_key, res_dir): - params = { - "dataset_key" : dset_key, - "datastore" : "False", - "uncertainty": "False", - "splitter": "scaffold", - "split_valid_frac": "0.1", - "split_test_frac": "0.1", - "split_strategy": "train_valid_test", - "previously_split": "False", - "prediction_type": "classification", - "model_choice_score_type": "roc_auc", - "response_cols" : "active", - "id_col": "compound_id", - "smiles_col" : "rdkit_smiles", - "result_dir": res_dir, - "system": "LC", - "transformers": "True", - "model_type": "NN", - "featurizer": "computed_descriptors", - "descriptor_type": "rdkit_raw", - "learning_rate": ".0007", - "layer_sizes": "512,128", - "dropouts": "0.3,0.3", - "save_results": "False", - "max_epochs": "500", - "early_stopping_patience": "50", - "verbose": "False" - } + nonbalanced_params = params_wo_balan(dset_key, res_dir) + nonbalanced_weights = make_pipeline_and_get_weights(nonbalanced_params) + (weight,), (count,) = np.unique(nonbalanced_weights, return_counts=True) + assert weight == 1 + assert count == 436 +def make_pipeline_and_get_weights(params): pparams = parse.wrapper(params) - MP = mp.ModelPipeline(pparams) + model_pipeline = mp.ModelPipeline(pparams) + model_pipeline.train_model() - split_uuid = MP.split_dataset() - return split_uuid + return model_pipeline.data.train_valid_dsets[0][0].w -def train_model_wo_balan(dset_key, split_uuid, res_dir): +def params_wo_balan(dset_key, res_dir): # Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics. 
params = { "dataset_key" : dset_key, "datastore" : "False", "uncertainty": "False", "splitter": "scaffold", - "split_valid_frac": "0.1", - "split_test_frac": "0.1", + "split_valid_frac": "0.20", + "split_test_frac": "0.20", "split_strategy": "train_valid_test", - "previously_split": "True", - "split_uuid": split_uuid, "prediction_type": "classification", "model_choice_score_type": "roc_auc", "response_cols" : "active", @@ -92,45 +52,28 @@ def train_model_wo_balan(dset_key, split_uuid, res_dir): "system": "LC", "transformers": "True", "model_type": "NN", - "featurizer": "computed_descriptors", - "descriptor_type": "rdkit_raw", + "featurizer": "ecfp", "learning_rate": ".0007", "layer_sizes": "512,128", "dropouts": "0.3,0.3", "save_results": "False", - "max_epochs": "500", - "early_stopping_patience": "50", + "max_epochs": "2", # You don't need to train very long. Just need to build datasets + "early_stopping_patience": "2", "verbose": "False", - - "seed":"0" } - for i in range(nreps): - pparams = parse.wrapper(params) - MP = mp.ModelPipeline(pparams) - MP.train_model() - wrapper = MP.model_wrapper - - for ss in ['valid', 'test']: - metvals = wrapper.get_pred_results(ss, 'best') - for metric in ['roc_auc_score', 'prc_auc_score', 'cross_entropy', 'precision', 'recall_score', 'npv', 'accuracy_score', 'bal_accuracy', 'kappa','matthews_cc']: - subset.append(ss) - balanced.append('no') - metrics.append(metric) - vals.append(metvals[metric]) + return params -def train_model_w_balan(dset_key, split_uuid, res_dir): +def params_w_balan(dset_key, res_dir): # Now train models on the same dataset with balancing weights params = { "dataset_key" : dset_key, "datastore" : "False", "uncertainty": "False", "splitter": "scaffold", - "split_valid_frac": "0.1", - "split_test_frac": "0.1", + "split_valid_frac": "0.20", + "split_test_frac": "0.20", "split_strategy": "train_valid_test", - "previously_split": "True", - "split_uuid": split_uuid, "prediction_type": "classification", 
"model_choice_score_type": "roc_auc", "response_cols" : "active", @@ -147,26 +90,12 @@ def train_model_w_balan(dset_key, split_uuid, res_dir): "layer_sizes": "512,128", "dropouts": "0.3,0.3", "save_results": "False", - "max_epochs": "500", - "early_stopping_patience": "50", + "max_epochs": "2", + "early_stopping_patience": "2", "verbose": "False", - - "seed":"0" } - for i in range(nreps): - pparams = parse.wrapper(params) - MP = mp.ModelPipeline(pparams) - MP.train_model() - wrapper = MP.model_wrapper - - for ss in ['valid', 'test']: - metvals = wrapper.get_pred_results(ss, 'best') - for metric in ['roc_auc_score', 'prc_auc_score', 'cross_entropy', 'precision', 'recall_score', 'npv', 'accuracy_score', 'bal_accuracy', 'kappa','matthews_cc']: - subset.append(ss) - balanced.append('yes') - metrics.append(metric) - vals.append(metvals[metric]) + return params if __name__ == '__main__': test_balancing_transformer() \ No newline at end of file From 627cc2000aee0d4acafabcf03d0f315a73757bb9 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Thu, 26 Sep 2024 15:30:30 -0700 Subject: [PATCH 36/57] Set the seed to make sure the number of positive and negative compounds in each run is the same --- .../integrative/balancing_trans/test_balancing_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index c249d6a8..51769700 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -60,6 +60,7 @@ def params_wo_balan(dset_key, res_dir): "max_epochs": "2", # You don't need to train very long. 
Just need to build datasets "early_stopping_patience": "2", "verbose": "False", + "seed":"0", } return params @@ -93,6 +94,7 @@ def params_w_balan(dset_key, res_dir): "max_epochs": "2", "early_stopping_patience": "2", "verbose": "False", + "seed":"0", } return params From 8decc0eab4d9bbc0b0766cbd163720268b864b67 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 30 Sep 2024 11:19:59 -0700 Subject: [PATCH 37/57] Removed unnecessary loop and printed out results from the perf_data test --- atomsci/ddm/pipeline/perf_data.py | 3 +- atomsci/ddm/test/unit/test_perf_data.py | 71 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 atomsci/ddm/test/unit/test_perf_data.py diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 5d43b70d..8f6ca15b 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -1112,8 +1112,7 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran self.subset = subset if self.subset in ('train', 'valid', 'train_valid'): - for fold, (train, valid) in enumerate(model_dataset.train_valid_dsets): - dataset = model_dataset.combined_training_data() + dataset = model_dataset.combined_training_data() elif self.subset == 'test': dataset = model_dataset.test_dset else: diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py new file mode 100644 index 00000000..2b9e6da6 --- /dev/null +++ b/atomsci/ddm/test/unit/test_perf_data.py @@ -0,0 +1,71 @@ +import atomsci.ddm.pipeline.perf_data as perf_data +import atomsci.ddm.pipeline.model_pipeline as model_pipeline +import atomsci.ddm.pipeline.featurization as feat +import atomsci.ddm.pipeline.parameter_parser as parse +import os +import tempfile +import pdb +import deepchem as dc +import numpy as np + +def test_KFoldRegressionPerfData(): + script_path = os.path.dirname(os.path.realpath(__file__)) + res_dir = tempfile.mkdtemp() + + params = 
{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "prediction_type": "classification", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": os.path.join(script_path, + '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "id_col": "compound_id", + "response_cols":"pIC50", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir":res_dir} + + # setup a pipeline that will be used to create performance data + pparams = parse.wrapper(params) + mp = model_pipeline.ModelPipeline(pparams) + mp.train_model() + + # creat performance data + perf = perf_data.create_perf_data(mp.params.prediction_type, + mp.data, mp.model_wrapper.transformers, 'train') + + ids = sorted(list(mp.data.train_valid_dsets[0][0].ids[:5])) + weights = perf.get_weights(ids) + assert weights.shape == (5,1) + assert all(weights==1) + + real_vals = perf.get_real_values(ids) + d = dc.data.NumpyDataset(X=np.ones_like(real_vals), y=real_vals, ids=ids, w=np.ones(len(ids))) + # pass correct values through the transformers + for t in perf.transformers: + d = t.transform(d) + + pred_vals = d.y + # This should have r2 of 1 + r2 = perf.accumulate_preds(pred_vals, ids) + assert r2 == 1 + # do a few more folds + r2 = perf.accumulate_preds(pred_vals, ids) + r2 = perf.accumulate_preds(pred_vals, ids) + + (res_ids, res_vals, res_std) = perf.get_pred_values() + (r2_mean, r2_std) = perf.compute_perf_metrics() + + print(f"should be {real_vals}, [0,0,0,0,0]") + print(res_vals, res_std) + #pdb.set_trace() + +if __name__ == "__main__": + test_KFoldRegressionPerfData() \ No newline at end of file From 317cc290dd07a5d1f32a9b54c61c51b58d6063c9 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 30 Sep 2024 15:45:18 -0700 Subject: [PATCH 38/57] accumulate_preds ignores the id parameter for 
SimpleRegressionPerfData and KFoldClassificationPerfData. Updated the parameters and documentation to match behavior. New tests for SimpleClassificationPerfData and SimpleRegressionPerfData --- atomsci/ddm/pipeline/perf_data.py | 18 +-- atomsci/ddm/test/unit/test_perf_data.py | 194 +++++++++++++++++++++++- 2 files changed, 200 insertions(+), 12 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 8f6ca15b..2ebebcfb 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -1400,14 +1400,14 @@ def __init__(self, model_dataset, transformers, subset, transformed=True): # **************************************************************************************** # class SimpleRegressionPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids=None, pred_stds=None): """Add training, validation or test set predictions to the data structure where we keep track of them. 
Args: predicted_vals (np.array): Array of predicted values - ids (list): List of the compound ids of the dataset + ids: Ignored for this class pred_stds (np.array): Optional np.array of the prediction standard deviations @@ -1420,8 +1420,8 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): if pred_stds is not None: self.pred_stds = self._reshape_preds(pred_stds) pred_vals = dc.trans.undo_transforms(self.pred_vals, self.transformers) - real_vals = self.get_real_values(ids) - weights = self.get_weights(ids) + real_vals = self.get_real_values() + weights = self.get_weights() scores = [] for i in range(self.num_tasks): nzrows = np.where(weights[:,i] != 0)[0] @@ -1464,7 +1464,7 @@ def get_pred_values(self): # class SimpleRegressionPerfData def get_real_values(self, ids=None): """Returns the real dataset response values, with any transformations undone, as an (ncmpds, ntasks) array - with compounds in the same ID order as in the return from get_pred_values(). + with compounds in the same order as when this was created. Args: ids: Ignored for this class @@ -1637,14 +1637,14 @@ def __init__(self, model_dataset, transformers, subset, predict_probs=True, tran # **************************************************************************************** # class SimpleClassificationPerfData - def accumulate_preds(self, predicted_vals, ids, pred_stds=None): + def accumulate_preds(self, predicted_vals, ids=None, pred_stds=None): """Add training, validation or test set predictions from the current dataset to the data structure where we keep track of them. 
Arguments: predicted_vals (np.array): Array of predicted values (class probabilities) - ids (list): List of the compound ids of the dataset + ids: Ignored for this class pred_stds (np.array): Optional np.array of the prediction standard deviations @@ -1655,8 +1655,8 @@ def accumulate_preds(self, predicted_vals, ids, pred_stds=None): class_probs = self.pred_vals = self._reshape_preds(predicted_vals) if pred_stds is not None: self.pred_stds = self._reshape_preds(pred_stds) - real_vals = self.get_real_values(ids) - weights = self.get_weights(ids) + real_vals = self.get_real_values() + weights = self.get_weights() # Break out different predictions for each task, with zero-weight compounds masked out, and compute per-task metrics scores = [] for i in range(self.num_tasks): diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py index 2b9e6da6..ce4c237e 100644 --- a/atomsci/ddm/test/unit/test_perf_data.py +++ b/atomsci/ddm/test/unit/test_perf_data.py @@ -1,6 +1,5 @@ import atomsci.ddm.pipeline.perf_data as perf_data import atomsci.ddm.pipeline.model_pipeline as model_pipeline -import atomsci.ddm.pipeline.featurization as feat import atomsci.ddm.pipeline.parameter_parser as parse import os import tempfile @@ -65,7 +64,196 @@ def test_KFoldRegressionPerfData(): print(f"should be {real_vals}, [0,0,0,0,0]") print(res_vals, res_std) - #pdb.set_trace() + +def test_KFoldClassificationPerfData(): + script_path = os.path.dirname(os.path.realpath(__file__)) + res_dir = tempfile.mkdtemp() + + params = {"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "prediction_type": "classification", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": os.path.join(script_path, + '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "id_col": "compound_id", + 
"response_cols":"active", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "classification", + "result_dir":res_dir} + + # setup a pipeline that will be used to create performance data + pparams = parse.wrapper(params) + mp = model_pipeline.ModelPipeline(pparams) + mp.train_model() + + # creat performance data + perf = perf_data.create_perf_data(mp.params.prediction_type, + mp.data, mp.model_wrapper.transformers, 'train') + + ids = sorted(list(mp.data.train_valid_dsets[0][0].ids)) + weights = perf.get_weights(ids) + assert weights.shape == (len(ids),1) + assert all(weights==1) + + real_vals = perf.get_real_values(ids) + d = dc.data.NumpyDataset(X=np.ones_like(real_vals), y=real_vals, ids=ids, w=np.ones(len(ids))) + # There should be no transformers + assert len(perf.transformers) == 0 + + num_classes = 2 + # input to to_one_hot needs to have the shape (N,) not (N,1) + pred_vals = dc.metrics.to_one_hot(d.y.reshape(len(d.y)), num_classes) + # This should have r2 of 1 + roc_auc_score = perf.accumulate_preds(pred_vals, ids) + assert roc_auc_score == 1 + # do a few more folds + roc_auc_score = perf.accumulate_preds(pred_vals, ids) + roc_auc_score = perf.accumulate_preds(pred_vals, ids) + + (res_ids, res_classes, res_probs, res_std) = perf.get_pred_values() + (roc_auc_mean, roc_auc_std) = perf.compute_perf_metrics() + + # std should be zero + assert all((res_std==np.zeros_like(res_std)).flatten()) + # probs should match predictions + assert all((res_probs==pred_vals.reshape(len(d.y), 1, num_classes)).flatten()) + # all predictions are correct + assert all(res_classes==real_vals) + # perfect score every time + assert roc_auc_mean==1 + assert roc_auc_std==0 + +def test_SimpleRegressionPerfData(): + script_path = os.path.dirname(os.path.realpath(__file__)) + res_dir = tempfile.mkdtemp() + + params = {"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "prediction_type": 
"classification", + "split_strategy": "train_valid_test", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": os.path.join(script_path, + '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "id_col": "compound_id", + "response_cols":"pIC50", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir":res_dir} + + # setup a pipeline that will be used to create performance data + pparams = parse.wrapper(params) + mp = model_pipeline.ModelPipeline(pparams) + mp.train_model() + + # creat performance data + perf = perf_data.create_perf_data(mp.params.prediction_type, + mp.data, mp.model_wrapper.transformers, 'train') + + real_vals = perf.get_real_values() + weights = perf.get_weights() + ids = np.array(range(len(real_vals))) # these are not used by SimpleRegressionPerfData + assert weights.shape == (len(ids),1) + assert all(weights==1) + + d = dc.data.NumpyDataset(X=np.ones_like(real_vals), y=real_vals, + ids=ids, w=np.ones(len(ids))) + # pass correct values through the transformers + for t in perf.transformers: + d = t.transform(d) + + pred_vals = d.y + # This should have r2 of 1 ids are ignored + r2 = perf.accumulate_preds(pred_vals, ids) + assert r2 == 1 + + (res_ids, res_vals, _) = perf.get_pred_values() + (r2_mean, _) = perf.compute_perf_metrics() + + # the predicted values should equal the real values + assert all(real_vals == res_vals) + + # should be a perfect score + assert r2_mean == 1 + + +def test_SimpleClassificationPerfData(): + script_path = os.path.dirname(os.path.realpath(__file__)) + res_dir = tempfile.mkdtemp() + + params = {"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "prediction_type": "classification", + "split_strategy": "train_valid_test", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + 
"dataset_key": os.path.join(script_path, + '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "id_col": "compound_id", + "response_cols":"active", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "classification", + "result_dir":res_dir} + + # setup a pipeline that will be used to create performance data + pparams = parse.wrapper(params) + mp = model_pipeline.ModelPipeline(pparams) + mp.train_model() + + # creat performance data + perf = perf_data.create_perf_data(mp.params.prediction_type, + mp.data, mp.model_wrapper.transformers, 'train') + + ids = sorted(list(mp.data.train_valid_dsets[0][0].ids)) + weights = perf.get_weights() + real_vals = perf.get_real_values() + assert weights.shape == (len(ids),1) + assert all(weights==1) + + d = dc.data.NumpyDataset(X=np.ones_like(real_vals), y=real_vals, ids=ids, w=np.ones(len(ids))) + # There should be no transformers + assert len(perf.transformers) == 0 + + num_classes = 2 + # input to to_one_hot needs to have the shape (N,) not (N,1) + pred_vals = dc.metrics.to_one_hot(d.y.reshape(len(d.y)), num_classes) + # This should have r2 of 1 + roc_auc_score = perf.accumulate_preds(pred_vals) + assert roc_auc_score == 1 + + (res_ids, res_classes, res_probs, _) = perf.get_pred_values() + (roc_auc_mean, _) = perf.compute_perf_metrics() + + # probs should match predictions + assert all((res_probs==pred_vals.reshape(len(d.y), 1, num_classes)).flatten()) + # all predictions are correct + assert all(res_classes==real_vals) + # perfect score every time + assert roc_auc_mean==1 + if __name__ == "__main__": - test_KFoldRegressionPerfData() \ No newline at end of file + test_SimpleClassificationPerfData() + #test_KFoldClassificationPerfData() + #test_SimpleRegressionPerfData() + #test_KFoldRegressionPerfData() \ No newline at end of file From 5055889e0cbefc440a94cdc711ea06fd1954cc6e Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 30 Sep 2024 15:46:36 -0700 Subject: [PATCH 39/57] the positive and 
negative counts are inconsistent, instead just check to see that there is more of the major class --- .../integrative/balancing_trans/test_balancing_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 51769700..09043c45 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -16,9 +16,7 @@ def test_balancing_transformer(): balanced_weights = make_pipeline_and_get_weights(balanced_params) (major_weight, minor_weight), (major_count, minor_count) = np.unique(balanced_weights, return_counts=True) assert major_weight < minor_weight - - assert major_count == 416 - assert minor_count == 20 + assert major_count > minor_count nonbalanced_params = params_wo_balan(dset_key, res_dir) nonbalanced_weights = make_pipeline_and_get_weights(nonbalanced_params) From 16d50f8c7ea63ecc14b2310708ee40b8b3461dae Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 28 Oct 2024 11:47:01 -0700 Subject: [PATCH 40/57] Undo transformations before calculating mean and std of predictions --- atomsci/ddm/pipeline/perf_data.py | 27 ++--- atomsci/ddm/test/unit/test_perf_data.py | 135 +++++++++++++++++++----- 2 files changed, 123 insertions(+), 39 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index 2ebebcfb..c5a5344d 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -12,6 +12,7 @@ from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error from atomsci.ddm.pipeline import transformations as trans +import pdb # ****************************************************************************************************************************** @@ -966,12 +967,14 @@ def get_pred_values(self): ids = 
[id for id in all_ids if not (self.pred_vals[id].size == 0)] if self.subset in ['train', 'test', 'train_valid']: - rawvals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids]) - vals = dc.trans.undo_transforms(rawvals, self.transformers) + tmp = [dc.trans.undo_transforms(self.pred_vals[id], self.transformers) for id in ids] if self.folds > 1: - stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True).reshape((1,-1)) - for id in ids]), self.transformers) + # mean transformed predictions + trans_preds = np.stack(tmp) + vals = trans_preds.mean(axis=1) + stds = trans_preds.std(axis=1) else: + vals = np.concatenate(tmp) stds = None else: rawvals = np.concatenate([self.pred_vals[id].reshape((1,-1)) for id in ids], axis=0) @@ -1229,14 +1232,14 @@ def get_pred_values(self): ids = [id for id in all_ids if not (self.pred_vals[id].size == 0)] if self.subset in ['train', 'test', 'train_valid']: - #class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).mean(axis=0, keepdims=True) - # for id in ids], axis=0) - #prob_stds = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).std(axis=0, keepdims=True) - # for id in ids], axis=0) - class_probs = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers) - prob_stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True) - for id in ids], axis=0), self.transformers) + class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).mean(axis=0, keepdims=True) + for id in ids], axis=0) + prob_stds = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers).std(axis=0, keepdims=True) + for id in ids], axis=0) + #class_probs = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True) + # for id in ids], 
axis=0), self.transformers) + #prob_stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True) + # for id in ids], axis=0), self.transformers) else: class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers) for id in ids], axis=0) prob_stds = None diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py index ce4c237e..27a2fac2 100644 --- a/atomsci/ddm/test/unit/test_perf_data.py +++ b/atomsci/ddm/test/unit/test_perf_data.py @@ -6,24 +6,35 @@ import pdb import deepchem as dc import numpy as np +import shutil +import pandas as pd -def test_KFoldRegressionPerfData(): +def copy_to_temp(dskey, res_dir): + new_dskey = shutil.copy(dskey, res_dir) + return new_dskey + +def setup_paths(): script_path = os.path.dirname(os.path.realpath(__file__)) res_dir = tempfile.mkdtemp() + dskey = os.path.join(script_path, '../test_datasets/aurka_chembl_base_smiles_union.csv') + tmp_dskey = copy_to_temp(dskey, res_dir) + + return res_dir, tmp_dskey + +def test_KFoldRegressionPerfData(): + res_dir, tmp_dskey = setup_paths() params = {"verbose": "True", "datastore": "False", "save_results": "False", "model_type": "NN", "featurizer": "ecfp", - "prediction_type": "classification", "split_strategy": "k_fold_cv", "splitter": "random", "split_test_frac": "0.15", "split_valid_frac": "0.15", "transformers": "True", - "dataset_key": os.path.join(script_path, - '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "dataset_key": tmp_dskey, "id_col": "compound_id", "response_cols":"pIC50", "smiles_col": "base_rdkit_smiles", @@ -40,9 +51,11 @@ def test_KFoldRegressionPerfData(): perf = perf_data.create_perf_data(mp.params.prediction_type, mp.data, mp.model_wrapper.transformers, 'train') - ids = sorted(list(mp.data.train_valid_dsets[0][0].ids[:5])) + assert isinstance(perf, perf_data.KFoldRegressionPerfData) + + ids = sorted(list(mp.data.combined_training_data().ids)) weights = 
perf.get_weights(ids) - assert weights.shape == (5,1) + assert weights.shape == (len(ids),1) assert all(weights==1) real_vals = perf.get_real_values(ids) @@ -62,26 +75,94 @@ def test_KFoldRegressionPerfData(): (res_ids, res_vals, res_std) = perf.get_pred_values() (r2_mean, r2_std) = perf.compute_perf_metrics() - print(f"should be {real_vals}, [0,0,0,0,0]") - print(res_vals, res_std) + assert np.allclose(res_vals, real_vals) + assert np.allclose(res_std, np.zeros_like(res_std)) + + # perfect score every time + assert r2_mean==1 + assert r2_std==0 + +def test_KFoldRegressionPerfDataMulti(): + res_dir, tmp_dskey = setup_paths() + + # duplicate pIC50 column + df = pd.read_csv(tmp_dskey) + df['pIC50_dupe'] = df['pIC50'] + df.to_csv(tmp_dskey, index=False) + + + params = {"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": tmp_dskey, + "id_col": "compound_id", + "response_cols":["pIC50", "pIC50_dupe"], + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir":res_dir} + + # setup a pipeline that will be used to create performance data + pparams = parse.wrapper(params) + mp = model_pipeline.ModelPipeline(pparams) + mp.train_model() + + # creat performance data + perf = perf_data.create_perf_data(mp.params.prediction_type, + mp.data, mp.model_wrapper.transformers, 'train') + + assert isinstance(perf, perf_data.KFoldRegressionPerfData) + + ids = sorted(list(mp.data.combined_training_data().ids)) + weights = perf.get_weights(ids) + assert weights.shape == (len(ids),2) + assert np.allclose(weights, np.ones_like(weights)) + + real_vals = perf.get_real_values(ids) + d = dc.data.NumpyDataset(X=np.ones_like(real_vals), y=real_vals, ids=ids, w=np.ones_like(weights)) + # pass correct values through the transformers 
+ for t in perf.transformers: + d = t.transform(d) + + pred_vals = d.y + # This should have r2 of 1 + r2 = perf.accumulate_preds(pred_vals, ids) + assert r2 == 1 + # do a few more folds + r2 = perf.accumulate_preds(pred_vals, ids) + r2 = perf.accumulate_preds(pred_vals, ids) + + (res_ids, res_vals, res_std) = perf.get_pred_values() + (r2_mean, r2_std) = perf.compute_perf_metrics() + + assert np.allclose(res_vals, real_vals) + assert np.allclose(res_std, np.zeros_like(res_std)) + + # perfect score every time + assert r2_mean==1 + assert r2_std==0 def test_KFoldClassificationPerfData(): - script_path = os.path.dirname(os.path.realpath(__file__)) - res_dir = tempfile.mkdtemp() + res_dir, tmp_dskey = setup_paths() params = {"verbose": "True", "datastore": "False", "save_results": "False", "model_type": "NN", "featurizer": "ecfp", - "prediction_type": "classification", "split_strategy": "k_fold_cv", "splitter": "random", "split_test_frac": "0.15", "split_valid_frac": "0.15", "transformers": "True", - "dataset_key": os.path.join(script_path, - '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "dataset_key": tmp_dskey, "id_col": "compound_id", "response_cols":"active", "smiles_col": "base_rdkit_smiles", @@ -98,6 +179,8 @@ def test_KFoldClassificationPerfData(): perf = perf_data.create_perf_data(mp.params.prediction_type, mp.data, mp.model_wrapper.transformers, 'train') + assert isinstance(perf, perf_data.KFoldClassificationPerfData) + ids = sorted(list(mp.data.train_valid_dsets[0][0].ids)) weights = perf.get_weights(ids) assert weights.shape == (len(ids),1) @@ -132,23 +215,20 @@ def test_KFoldClassificationPerfData(): assert roc_auc_std==0 def test_SimpleRegressionPerfData(): - script_path = os.path.dirname(os.path.realpath(__file__)) - res_dir = tempfile.mkdtemp() + res_dir, tmp_dskey = setup_paths() params = {"verbose": "True", "datastore": "False", "save_results": "False", "model_type": "NN", "featurizer": "ecfp", - "prediction_type": "classification", 
"split_strategy": "train_valid_test", "splitter": "random", "split_test_frac": "0.15", "split_valid_frac": "0.15", "transformers": "True", - "dataset_key": os.path.join(script_path, - '../test_datasets/aurka_chembl_base_smiles_union.csv'), "id_col": "compound_id", + "dataset_key": tmp_dskey, "response_cols":"pIC50", "smiles_col": "base_rdkit_smiles", "max_epochs":"2", @@ -164,6 +244,8 @@ def test_SimpleRegressionPerfData(): perf = perf_data.create_perf_data(mp.params.prediction_type, mp.data, mp.model_wrapper.transformers, 'train') + assert isinstance(perf, perf_data.SimpleRegressionPerfData) + real_vals = perf.get_real_values() weights = perf.get_weights() ids = np.array(range(len(real_vals))) # these are not used by SimpleRegressionPerfData @@ -190,24 +272,20 @@ def test_SimpleRegressionPerfData(): # should be a perfect score assert r2_mean == 1 - def test_SimpleClassificationPerfData(): - script_path = os.path.dirname(os.path.realpath(__file__)) - res_dir = tempfile.mkdtemp() + res_dir, tmp_dskey = setup_paths() params = {"verbose": "True", "datastore": "False", "save_results": "False", "model_type": "NN", "featurizer": "ecfp", - "prediction_type": "classification", "split_strategy": "train_valid_test", "splitter": "random", "split_test_frac": "0.15", "split_valid_frac": "0.15", "transformers": "True", - "dataset_key": os.path.join(script_path, - '../test_datasets/aurka_chembl_base_smiles_union.csv'), + "dataset_key": tmp_dskey, "id_col": "compound_id", "response_cols":"active", "smiles_col": "base_rdkit_smiles", @@ -224,6 +302,8 @@ def test_SimpleClassificationPerfData(): perf = perf_data.create_perf_data(mp.params.prediction_type, mp.data, mp.model_wrapper.transformers, 'train') + assert isinstance(perf, perf_data.SimpleClassificationPerfData) + ids = sorted(list(mp.data.train_valid_dsets[0][0].ids)) weights = perf.get_weights() real_vals = perf.get_real_values() @@ -253,7 +333,8 @@ def test_SimpleClassificationPerfData(): if __name__ == "__main__": + 
test_KFoldRegressionPerfDataMulti() + test_KFoldRegressionPerfData() test_SimpleClassificationPerfData() - #test_KFoldClassificationPerfData() - #test_SimpleRegressionPerfData() - #test_KFoldRegressionPerfData() \ No newline at end of file + test_KFoldClassificationPerfData() + test_SimpleRegressionPerfData() \ No newline at end of file From 02809413588e9d0afd21563f75020a6610df6b18 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 28 Oct 2024 13:33:27 -0700 Subject: [PATCH 41/57] Removed pdb imports --- atomsci/ddm/pipeline/perf_data.py | 1 - atomsci/ddm/test/unit/test_perf_data.py | 1 - 2 files changed, 2 deletions(-) diff --git a/atomsci/ddm/pipeline/perf_data.py b/atomsci/ddm/pipeline/perf_data.py index c5a5344d..cf9a3c35 100644 --- a/atomsci/ddm/pipeline/perf_data.py +++ b/atomsci/ddm/pipeline/perf_data.py @@ -12,7 +12,6 @@ from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error from atomsci.ddm.pipeline import transformations as trans -import pdb # ****************************************************************************************************************************** diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py index 27a2fac2..d6c0637e 100644 --- a/atomsci/ddm/test/unit/test_perf_data.py +++ b/atomsci/ddm/test/unit/test_perf_data.py @@ -3,7 +3,6 @@ import atomsci.ddm.pipeline.parameter_parser as parse import os import tempfile -import pdb import deepchem as dc import numpy as np import shutil From a4c2b830bec5cc93d88540e6b049f1a3c555be3d Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 27 Nov 2024 10:36:46 -0800 Subject: [PATCH 42/57] Updated help for 'seed' input --- atomsci/ddm/pipeline/MultitaskScaffoldSplit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py index 0437a407..37dd7a34 100644 --- a/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py +++ 
b/atomsci/ddm/pipeline/MultitaskScaffoldSplit.py @@ -1044,7 +1044,7 @@ def parse_args(): parser.add_argument('id_col', type=str, help='the column containing ids') parser.add_argument('response_cols', type=str, help='comma seperated string of response columns') parser.add_argument('output', type=str, help='name of the split file') - parser.add_argument('seed', type=int, default=0, help='name of the split file') + parser.add_argument('seed', type=int, default=0, help='Random seed used in random number generators.') return parser.parse_args() From 8e29047c5089a28b0a6d7439f036a1a7cf67c828 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 27 Nov 2024 10:39:05 -0800 Subject: [PATCH 43/57] Removed commented out seed --- atomsci/ddm/pipeline/model_wrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index ba74966d..8ff57ad7 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -2000,7 +2000,7 @@ def make_dc_model(self, model_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state= self.seed, #0, + random_state= self.seed, missing=np.nan, importance_type='gain', n_jobs=-1, @@ -2025,7 +2025,7 @@ def make_dc_model(self, model_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=self.seed, #0, + random_state=self.seed, importance_type='gain', missing=np.nan, gpu_id = -1, @@ -2135,7 +2135,7 @@ def reload_model(self, reload_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=self.seed, #0, + random_state=self.seed, missing=np.nan, importance_type='gain', n_jobs=-1, @@ -2160,7 +2160,7 @@ def reload_model(self, reload_dir): reg_lambda=1, scale_pos_weight=1, base_score=0.5, - random_state=self.seed, #0, + random_state=self.seed, importance_type='gain', missing=np.nan, gpu_id = -1, From 268ba05d3adf098ace2ff385fdfd7cfb67671b9e Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" 
Date: Wed, 27 Nov 2024 11:23:04 -0800 Subject: [PATCH 44/57] model_retrian has an option to either keep or discard the saved seed. Currently default is to discard the seed --- .../dc_models/test_retrain_dc_models.py | 65 ++++++++++++++----- atomsci/ddm/utils/model_retrain.py | 36 ++++++---- 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py b/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py index 39d057a1..499527b5 100644 --- a/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py +++ b/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py @@ -153,7 +153,7 @@ def train_and_predict(train_json_f, prefix='delaney-processed'): return tar_f -def verify_saved_params(original_json_f, tar_f): +def verify_saved_params(original_json_f, tar_f, keep_seed=False): """compares saved params in a tar file with original json""" reload_dir = tempfile.mkdtemp() with tarfile.open(tar_f, mode='r:gz') as tar: @@ -187,7 +187,18 @@ def verify_saved_params(original_json_f, tar_f): print(tar_feat_params) assert original_feat_params == tar_feat_params -def retrain(tar_f, prefix='H1'): + print('-----------------------------------') + print('seeds') + print(original_pp.seed) + print(tar_pp.seed) + assert original_pp.seed is not None + assert tar_pp.seed is not None + if keep_seed: + assert original_pp.seed == tar_pp.seed + else: + assert original_pp.seed != tar_pp.seed + +def retrain(tar_f, prefix='H1', keep_seed=False): """retrain a model from tar_f""" model = mr.train_model_from_tar(tar_f, 'result') @@ -212,7 +223,7 @@ def H1_init(): # Train and Predict # ----- -def test_reg_config_H1_fit_AttentiveFPModel(): +def run_test_reg_config_H1_fit_AttentiveFPModel(keep_seed): if not llnl_utils.is_lc_system(): assert True return @@ -223,12 +234,16 @@ def test_reg_config_H1_fit_AttentiveFPModel(): verify_saved_params(json_f, tar_f) - re_tar_f = retrain(tar_f, 'H1') + re_tar_f = 
retrain(tar_f, 'H1', keep_seed=keep_seed) - verify_saved_params(json_f, re_tar_f) + verify_saved_params(json_f, re_tar_f, keep_seed=keep_seed) + +def test_reg_config_H1_fit_AttentiveFPModel(): + run_test_reg_config_H1_fit_AttentiveFPModel(True) + run_test_reg_config_H1_fit_AttentiveFPModel(False) # ----- -def test_reg_config_H1_fit_GCNModel(): +def run_test_reg_config_H1_fit_GCNModel(keep_seed): if not llnl_utils.is_lc_system(): assert True return @@ -239,12 +254,16 @@ def test_reg_config_H1_fit_GCNModel(): verify_saved_params(json_f, tar_f) - re_tar_f = retrain(tar_f, 'H1') + re_tar_f = retrain(tar_f, 'H1', keep_seed=keep_seed) - verify_saved_params(json_f, re_tar_f) + verify_saved_params(json_f, re_tar_f, keep_seed=keep_seed) + +def test_reg_config_H1_fit_GCNModel(): + run_test_reg_config_H1_fit_GCNModel(True) + run_test_reg_config_H1_fit_GCNModel(False) # ----- -def test_reg_config_H1_fit_MPNNModel(): +def run_test_reg_config_H1_fit_MPNNModel(keep_seed): if not llnl_utils.is_lc_system(): assert True return @@ -255,11 +274,15 @@ def test_reg_config_H1_fit_MPNNModel(): verify_saved_params(json_f, tar_f) - re_tar_f = retrain(tar_f, 'H1') + re_tar_f = retrain(tar_f, 'H1', keep_seed=keep_seed) - verify_saved_params(json_f, re_tar_f) + verify_saved_params(json_f, re_tar_f, keep_seed=keep_seed) -def test_reg_config_H1_fit_GraphConvModel(): +def test_reg_config_H1_fit_MPNNModel(): + run_test_reg_config_H1_fit_MPNNModel(True) + run_test_reg_config_H1_fit_MPNNModel(False) + +def run_test_reg_config_H1_fit_GraphConvModel(keep_seed): if not llnl_utils.is_lc_system(): assert True return @@ -270,11 +293,15 @@ def test_reg_config_H1_fit_GraphConvModel(): verify_saved_params(json_f, tar_f) - re_tar_f = retrain(tar_f, 'H1') + re_tar_f = retrain(tar_f, 'H1', keep_seed=keep_seed) - verify_saved_params(json_f, re_tar_f) + verify_saved_params(json_f, re_tar_f, keep_seed=keep_seed) -def test_reg_config_H1_fit_PytorchMPNNModel(): +def test_reg_config_H1_fit_GraphConvModel(): + 
run_test_reg_config_H1_fit_GraphConvModel(True) + run_test_reg_config_H1_fit_GraphConvModel(False) + +def run_test_reg_config_H1_fit_PytorchMPNNModel(keep_seed): if not llnl_utils.is_lc_system(): assert True return @@ -285,9 +312,13 @@ def test_reg_config_H1_fit_PytorchMPNNModel(): verify_saved_params(json_f, tar_f) - re_tar_f = retrain(tar_f, 'H1') + re_tar_f = retrain(tar_f, 'H1', keep_seed=keep_seed) - verify_saved_params(json_f, re_tar_f) + verify_saved_params(json_f, re_tar_f, keep_seed) + +def test_reg_config_H1_fit_PytorchMPNNModel(): + run_test_reg_config_H1_fit_PytorchMPNNModel(True) + run_test_reg_config_H1_fit_PytorchMPNNModel(False) if __name__ == '__main__': test_reg_config_H1_fit_PytorchMPNNModel() # Pytorch implementation of MPNNModel diff --git a/atomsci/ddm/utils/model_retrain.py b/atomsci/ddm/utils/model_retrain.py index 1e376f5e..5f89e649 100644 --- a/atomsci/ddm/utils/model_retrain.py +++ b/atomsci/ddm/utils/model_retrain.py @@ -49,7 +49,7 @@ mlmt_supported = False -def train_model(input, output, dskey='', production=False): +def train_model(input, output, dskey='', production=False, keep_seed=False): """Retrain a model saved in a model_metadata.json file Args: @@ -76,6 +76,13 @@ def train_model(input, output, dskey='', production=False): # Parse parameters params = parse.wrapper(config) + + # keep or discard seed. + if keep_seed and params.seed is None: + raise RuntimeWarning("Expected to find random seed not found. 
Retraining using a new random seed.") + elif not keep_seed: + params.seed = None + params.result_dir = output # otherwise this will have the same uuid as the source model params.model_uuid = None @@ -101,7 +108,7 @@ def train_model(input, output, dskey='', production=False): return model -def train_model_from_tar(input, output, dskey='', production=False): +def train_model_from_tar(input, output, dskey='', production=False, keep_seed=False): """Retrain a model saved in a tar.gz file Args: @@ -122,9 +129,9 @@ def train_model_from_tar(input, output, dskey='', production=False): # make metadata path metadata_path = os.path.join(tmpdir, 'model_metadata.json') - return train_model(metadata_path, output, dskey=dskey, production=production) + return train_model(metadata_path, output, dskey=dskey, production=production, keep_seed=keep_seed) -def train_model_from_tracker(model_uuid, output_dir, production=False): +def train_model_from_tracker(model_uuid, output_dir, production=False, keep_seed=False): """Retrain a model saved in the model tracker, but save it to output_dir and don't insert it into the model tracker Args: @@ -158,6 +165,12 @@ def train_model_from_tracker(model_uuid, output_dir, production=False): #if config[] # Parse parameters params = parse.wrapper(config) + # keep or discard seed. + if keep_seed and params.seed is None: + raise RuntimeWarning("Expected to find random seed not found. 
Retraining using a new random seed.") + elif not keep_seed: + params.seed = None + params.result_dir = output_dir # otherwise this will have the same uuid as the source model params.model_uuid = None @@ -181,7 +194,7 @@ def train_model_from_tracker(model_uuid, output_dir, production=False): return model -def train_models_from_dataset_keys(input, output, pred_type='regression', production=False): +def train_models_from_dataset_keys(input, output, pred_type='regression', production=False, keep_seed=False): """Retrain a list of models from an input file Args: @@ -243,7 +256,7 @@ def train_models_from_dataset_keys(input, output, pred_type='regression', produc for model_uuid in best_mods.model_uuid.sort_values(): try: logger.debug('Training %s in %s' % (model_uuid, output)) - train_model_from_tracker(model_uuid, output, production=production) + train_model_from_tracker(model_uuid, output, production=production, keep_seed=keep_seed) except Exception: Exception(f'Error for model_uuid {model_uuid}') pass @@ -263,6 +276,7 @@ def main(argv): parser.add_argument('-dk', '--dataset_key', default='', help='Sometimes dataset keys get moved. Specify new location of dataset. Only works when passing in one model at time.') parser.add_argument('-pd_type', '--pred_type', default='regression', help='Specify the prediction type used for model retrain. 
The default is set to regression.') parser.add_argument('-prod', '--production', action='store_true', default=False, help='Retrain the model in production mode') + parser.add_argument('-keep_seed', '--keep_seed', action='store_true', default=False, help='Retrain the model using the saved seed if available.') args = parser.parse_args() @@ -277,19 +291,19 @@ def main(argv): if os.path.isdir(input): # loop for path in Path(input).rglob('model_metadata.json'): - train_model(path.absolute(), output, production=args.production) + train_model(path.absolute(), output, production=args.production, keep_seed=args.keep_seed) elif os.path.isfile(input): # 2 if it's a file, check if it's a json or tar.gz or file that contains list of dataset keys if input.endswith('.json'): - train_model(input, output, dskey=args.dataset_key, production=args.production) + train_model(input, output, dskey=args.dataset_key, production=args.production, keep_seed=args.keep_seed) elif input.endswith('.tar.gz'): - train_model_from_tar(input, output, dskey=args.dataset_key, production=args.production) + train_model_from_tar(input, output, dskey=args.dataset_key, production=args.production, keep_seed=args.keep_seed) else: - train_models_from_dataset_keys(input, output, pred_type=args.pred_type, production=args.production) + train_models_from_dataset_keys(input, output, pred_type=args.pred_type, production=args.production, keep_seed=args.keep_seed) else: try: # 3 try to process 'input' as uuid - train_model_from_tracker(input, output, production=args.production) + train_model_from_tracker(input, output, production=args.production, keep_seed=args.keep_seed) except Exception: Exception('Unrecognized input %s'%input) From 17ba026eb2e7b095fad1565f92309a88ea6a6b6d Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 27 Nov 2024 12:07:31 -0800 Subject: [PATCH 45/57] Pass on keep_seed argument --- .../ddm/test/integrative/dc_models/test_retrain_dc_models.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py b/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py index 499527b5..1013b484 100644 --- a/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py +++ b/atomsci/ddm/test/integrative/dc_models/test_retrain_dc_models.py @@ -200,7 +200,7 @@ def verify_saved_params(original_json_f, tar_f, keep_seed=False): def retrain(tar_f, prefix='H1', keep_seed=False): """retrain a model from tar_f""" - model = mr.train_model_from_tar(tar_f, 'result') + model = mr.train_model_from_tar(tar_f, 'result', keep_seed=keep_seed) uuid = model.params.model_uuid re_tar_f = f'result/{prefix}_curated_fit_model_{uuid}.tar.gz' From b2a0c5afed1e179bbceba9187c464df7fdef6e56 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 2 Dec 2024 15:35:47 -0800 Subject: [PATCH 46/57] Looping through all folds is redundant --- atomsci/ddm/pipeline/model_datasets.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py index d5bd3a4a..a235abd8 100644 --- a/atomsci/ddm/pipeline/model_datasets.py +++ b/atomsci/ddm/pipeline/model_datasets.py @@ -723,8 +723,7 @@ def get_subset_responses_and_weights(self, subset, transformers): """ if subset not in self.subset_response_dict: if subset in ('train', 'valid', 'train_valid'): - for fold, (train, valid) in enumerate(self.train_valid_dsets): - dataset = self.combined_training_data() + dataset = self.combined_training_data() elif subset == 'test': dataset = self.test_dset else: From 60ed6702a7047c9315c749cd44ab59cb6c27f793 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Mon, 2 Dec 2024 15:37:52 -0800 Subject: [PATCH 47/57] Added option to keep the same random seed when retraining a model. 
Default is value is False --- .../integrative/model_retrain/config.json | 35 + .../integrative/model_retrain/example.csv | 751 ++++++++++++++++++ .../model_retrain/test_model_retrain.py | 86 ++ atomsci/ddm/utils/model_file_reader.py | 9 +- atomsci/ddm/utils/model_retrain.py | 2 + 5 files changed, 882 insertions(+), 1 deletion(-) create mode 100644 atomsci/ddm/test/integrative/model_retrain/config.json create mode 100644 atomsci/ddm/test/integrative/model_retrain/example.csv create mode 100644 atomsci/ddm/test/integrative/model_retrain/test_model_retrain.py diff --git a/atomsci/ddm/test/integrative/model_retrain/config.json b/atomsci/ddm/test/integrative/model_retrain/config.json new file mode 100644 index 00000000..f962baf6 --- /dev/null +++ b/atomsci/ddm/test/integrative/model_retrain/config.json @@ -0,0 +1,35 @@ +{ + "comment": "Input file", + "comment": "----------------------------------------", + "comment": "Note: dataset_key must be a path/file name: E.G. ./dataset.csv", + "id_col": "Id", + "smiles_col": "smiles", + "class_number": "3", + + "comment": "Split", + "comment": "----------------------------------------", + "splitter": "random", + + "comment": "Prediction Type", + "comment": "----------------------------------------", + "response_cols": "sol_category", + "prediction_type": "classification", + + "comment": "Features", + "comment": "----------------------------------------", + "featurizer": "ecfp", + + "comment": "Model", + "comment": "----------------------------------------", + "model_type": "NN", + "dropout": ".01,.01,.01", + "layer_sizes": "256,50,18", + "learning_rate": "0.00007", + "max_epochs": "25", + + "comment": "Training", + "comment": "----------------------------------------", + "comment": "This regulates how long to train the model", + "early_stopping_patience": "2" +} + diff --git a/atomsci/ddm/test/integrative/model_retrain/example.csv b/atomsci/ddm/test/integrative/model_retrain/example.csv new file mode 100644 index 
00000000..92ef31ec --- /dev/null +++ b/atomsci/ddm/test/integrative/model_retrain/example.csv @@ -0,0 +1,751 @@ +Id,smiles,sol_category +EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0 +EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0 +EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0 +EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0 +EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0 +EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0 +EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(=O)C2)C(=O)CC1,0 +EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O)cc1O,0 +EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0 +EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(=O)=O)c1,0 +EOS47744,C[C@H]1CCCN1S(=O)(=O)N(C)c1ccc(O)cc1,0 +EOS70302,O=C(Nc1ccccc1OCC1CC1)N1CCCC[C@@H]1CN1CCOCC1,0 +EOS84678,FC(F)(F)c1c[nH]c(NCc2ccccc2Cl)n1,0 +EOS67976,CCOc1cccc2c1OCC(C(=O)NS(=O)(=O)Cc1ccc(F)cc1)=C2,0 +EOS48390,O=C(Nc1ccnn1-c1cccc(F)c1)c1ccnc(-n2cncn2)c1,0 +EOS84428,Cc1ccc2c(c1)NC(=O)[C@]2(O)CC(=O)c1cccs1,0 +EOS23838,O=C(CN1C(=O)CSc2ccc(S(=O)(=O)N3CCOCC3)cc21)NCc1ccccc1,0 +EOS23248,CCOc1ccc(NC(=O)c2ccc(-n3cnnn3)cc2)cc1,0 +EOS12014,COc1ccc(OC)c(N(CC(=O)Nc2cccnc2)S(=O)(=O)c2ccc(OC)c(OC)c2)c1,0 +EOS70996,COc1ccc(OC)c([C@@H]2CCCN2S(=O)(=O)c2ccccc2C#N)c1,0 +EOS86630,COCCn1cc(NC(=O)N2CCN(Cc3cc(C)no3)CC2)cn1,0 +EOS101681,COc1c(C)cc(NC(=O)c2nn(Cc3ccc(F)cc3)c3c2CN(C(=O)c2ccc[nH]2)C[C@]3(C)C(N)=O)cc1C,0 +EOS76444,CC(=O)c1ccc(CNC(=O)C#Cc2ccccc2)nc1C,0 +EOS317,O=C(c1cccc(N2CCCCS2(=O)=O)c1)N1CCc2ccccc21,0 +EOS19869,CCN(CC)C(=O)c1ccc2c(c1)CC(=O)N2Cc1cccc(F)c1,0 +EOS26424,CCCn1cc(C(=O)Nc2ccc3c(c2)OCCO3)c(=O)c2ccc(C)nc21,0 +EOS12820,COc1ccc2[nH]c(SCC(=O)N3CCOCC3)nc2c1,0 +EOS100258,CC1(C)CN(C(=O)c2ccc(-c3cccc4nc(NC(=O)C5CC5)nn34)cc2)C1,0 +EOS62095,O=C(Nc1ccc(OC2CCCC2)nc1)[C@@H]1CCCc2[nH]ncc21,0 +EOS49570,Cc1ccccc1[C@@H](NS(=O)(=O)C1CCS(=O)(=O)CC1)C1CC1,0 +EOS55935,COc1cc(OC)c(NC(=O)c2cc(=O)n(-c3ccc(C)cc3)[nH]2)cc1F,0 
+EOS4688,COc1cccc(C(=O)N2CCN(C(=O)[C@@H]3CCCN3C(C)=O)CC2)c1,0 +EOS14843,CC(Cc1ccccc1)[n+]1[cH-]/c(=N\C(N)=O)on1,0 +EOS72732,CCCN(Cc1ccc(C#N)cc1)[C@H]1CCS(=O)(=O)C1,0 +EOS86828,C[C@@H](c1ccccc1)N1CCN(C(=O)Cn2cnn(C)c2=O)CC1,0 +EOS50535,CN(C)CCCOC1CCN(c2ccnc3c(F)cccc23)CC1,0 +EOS101685,O=C(NC1CCN(CCCCC2(C(=O)NCC(F)(F)F)c3ccccc3-c3ccccc32)CC1)c1ccccc1-c1ccc(C(F)(F)F)cc1,0 +EOS2424,COc1ccc(Cl)cc1NC(=O)N1CCN(c2cc(N(C)C)nc(C)n2)CC1,0 +EOS74181,CCn1cnnc1[C@H]1CCCN(C(=O)Nc2ccc(C)c(Cl)c2)C1,0 +EOS29352,Cc1cc(=O)n(CC(=O)N2CCN(c3ccccc3F)CC2)c(-c2ccc(F)cc2)n1,0 +EOS20523,CC(C)N1CCC(OCCCNC(=O)C23CCCC=C2N(C2CCCC2)C(=O)CC3)CC1,0 +EOS11203,CCCCNc1ccc(C(=O)N2CCC(Oc3cccnc3)CC2)cn1,0 +EOS69500,CNC(=O)C1CN(C(=O)COCC(F)(F)F)C1,0 +EOS54740,Cc1c(O)ccc(C(Cc2ccc3ccccc3n2)=NO)c1O,0 +EOS24531,CC(C)OCCCNC(=O)C1CCCN(c2ccc(-c3ccccc3)nn2)C1,0 +EOS85309,Cc1nnc([C@H](C)N(C)CCCNC(=O)C2CCC2)s1,0 +EOS71340,COc1ccc(CC(=O)N2CCCN(c3ccccc3C#N)CC2)cc1,0 +EOS73454,CCCc1nc(CN2CCC[C@@H]2Cn2cncn2)cs1,0 +EOS95892,O=C(Nc1ccc2c(c1)OC1(CCCC1)O2)N1CCN(C(=O)c2ccco2)CC1,0 +EOS71024,COc1cccc(C(=O)N2CCCC[C@H]2c2nc(C)cs2)c1OC,0 +EOS20280,CC(C)C[C@H]1C(=O)N2C[C@@H](N(C)C)C[C@H]2CN1C(=O)CCc1nc(-c2ccccc2)no1,0 +EOS75115,Cc1noc(C)c1CN1CCC[C@H]1c1cccs1,0 +EOS4204,CC(=O)NCCC1CCCCN1S(=O)(=O)c1cc(C)sc1C,0 +EOS1080,Cc1ccc(-c2noc(C(=O)N3CCCCC3)n2)cc1S(=O)(=O)Nc1cccnc1,0 +EOS71657,O=C(Nc1cc(S(=O)(=O)N2CCOCC2)ccc1O)c1ccccc1OCc1ccccc1,0 +EOS30361,CCOCc1nc(C2CCCN(C(=O)c3ccccc3)C2)no1,0 +EOS86322,C[C@H]1CCCN(CC(=O)NCc2ccc3c(c2)OCO3)C1,0 +EOS47656,CO[C@@H]1C[C@@H](c2ncn[nH]2)N(C(=O)c2cc(C(C)(C)C)n[nH]2)C1,0 +EOS84451,CC(=O)N1N=C(c2ccc3c(c2)OCO3)C[C@H]1c1cccc(F)c1,0 +EOS68460,CC1CCN(c2nnc(S[C@@H](C(=O)N3CCOCC3)c3ccccc3)n2C2CC2)CC1,0 +EOS94000,O=C(Cn1[nH]c(=O)c2ccccc2c1=O)NCCCCOc1ccc(Cl)cc1,0 +EOS100419,COc1cc2c(cc1OC)CN(CCc1ccc(NC(=O)c3cc(OC)c(OC)cc3NC(=O)c3cnc4ccccc4c3)cc1)CC2,0 +EOS53202,CN(Cc1nc2ccccc2c(=O)[nH]1)Cc1ccccc1Br,0 +EOS72157,Cc1nc2cc(=O)[nH]n2c(C)c1CC(=O)Nc1cccc(F)c1,0 
+EOS32717,COC[C@@H]1CC(F)(F)CN1C1CN(C(=O)c2ccc(OC)c(F)c2)C1,0 +EOS64116,Cc1ccc(N2C[C@@H](C(=O)N3CCN(S(=O)(=O)c4ccc5c(c4)OCCCO5)CC3)CC2=O)cc1,0 +EOS18710,CCCNC(=O)CSc1nc2c(C)nn(CC)c2c(=O)n1Cc1cccs1,0 +EOS75229,Cc1cc(=O)c(C(=O)N[C@@]2(CCO)CCOC2)nn1-c1cccc(C(F)(F)F)c1,0 +EOS101261,COc1cccc(CNc2ccc(S(=O)(=O)Nc3nc4ccccc4s3)cc2)c1O,0 +EOS63484,CCN(CC(=O)Nc1cccc(CN(C)C(C)=O)c1)c1ccccc1,0 +EOS52662,Cc1cn(CC(=O)N(C2CCCC2)[C@H]2CCS(=O)(=O)C2)c(=O)n1-c1ccc(C#N)cc1,0 +EOS75633,COCC(=O)N[C@@H](C)C(=O)N1CCN(S(=O)(=O)c2cccc3c2N=S=N3)CC1,0 +EOS48965,Cc1nc(NC(=O)N[C@@H](C)c2cnn(C)c2)sc1C,0 +EOS75232,O=C(NS(=O)(=O)CCCF)[C@@H]1CSc2ccccc21,0 +EOS101077,Cc1cc(NC(=O)Nc2ccc(N(C)C)cc2)c2cc(F)cc(F)c2n1.Cl,0 +EOS68367,O=C(Nc1[nH]c(=O)ncc1F)c1ccc(Cl)cc1Br,0 +EOS74964,CN(C)CCCNC(=O)c1cn(-c2ccccc2)nc1-c1cccnc1.Cl,0 +EOS73806,Cc1nc(C)c(C(=O)N2CCC[C@H]2c2cccnc2)s1,0 +EOS60795,CC(C)c1ccc(-c2nnc(NC(=O)C3=COCCO3)s2)cc1,0 +EOS85379,COc1ccc(OC)c(C(=O)Nc2cccc3c(OC)ccnc23)c1,0 +EOS51677,COc1ccc([C@@H]2CCCN2C(=O)c2ccc(S(=O)(=O)NC3CC3)cc2)c(OC)c1,0 +EOS69628,CCCCOc1ccc(S(=O)(=O)Nc2cc(OC)ccc2OC)cc1,0 +EOS20313,O=C(c1cscn1)N1C2CCC1Cn1c(nnc1-c1cccnc1)C2,0 +EOS40006,CC#CCN(c1ccccc1F)S(=O)(=O)CC,0 +EOS62200,Cc1ccc(-n2[nH]c(C(=O)Nc3cccn(CC(F)(F)F)c3=O)cc2=O)cc1,0 +EOS4603,Cc1cc2c(cc1NC(=O)N1CCCC(c3ccncn3)C1)n(C)c(=O)n2C,0 +EOS84475,COCCN1C(=O)C(O)=C(C(=O)c2ccc(OC)cc2)[C@H]1c1ccc(OC)cc1,0 +EOS32515,CCOCc1cnc(C)nc1C1CCCN(C(=O)c2cc(C)n(C)n2)C1,0 +EOS83045,C[C@]1(NC(=O)CCOc2ccc(F)cc2)CCS(=O)(=O)C1,0 +EOS64178,O=c1[nH]c(CN2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2)nc2ccsc12,0 +EOS69525,CN(Cc1coc(-c2ccccc2)n1)S(=O)(=O)c1ccc(F)cc1,0 +EOS72007,C[C@H](NC(=O)NC1CCC2(CC1)COC2)c1cnn(CC2CCC2)c1,0 +EOS68620,C[C@@H](CNC(=O)c1c(F)cccc1Cl)N1CCCC1,0 +EOS34716,O=C(c1ccn2nnnc2c1)N1C[C@@H]2C[C@H](C1)Cn1c2cccc1=O,0 +EOS100506,Cc1c[nH]c2ncnc(N3CCC(CN)(C(=O)Nc4cccc(OC(=O)N(C)C)c4)CC3)c12,0 +EOS32572,COCc1nnc2n1CC1CCC(C2)N1C(=O)CCc1ccc(OC)cc1,0 +EOS42051,CCC[C@]1(CO)CCN(C(=O)c2cccnc2C2CC2)C1,0 
+EOS102362,Cc1ccc(-n2ncc3c(=O)n(CC(=O)NCc4ccco4)cnc32)cc1,0 +EOS94889,CCCn1c(NC(=O)c2ccncc2)nc2ccccc21,0 +EOS85437,C#Cc1cccc(NC(=O)CN(C)c2ncc(Cl)cn2)c1,0 +EOS28121,O=C(Nc1ccccc1F)N1CCC(n2cnc3cc(F)ccc3c2=O)CC1,0 +EOS74746,C#CCSCCNC(=O)N(CC)c1ccccc1,0 +EOS83980,O=C(CCNC(=O)c1c[nH]c2ccccc12)N[C@H]1CCCCNC1=O,0 +EOS101619,Cc1ccc(S(=O)(=O)O)cc1.Cn1ncnc1[C@H]1c2n[nH]c(=O)c3cc(F)cc(c23)N[C@@H]1c1ccc(F)cc1,0 +EOS101380,N[C@@H](Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1)C(=O)[O-].[Na+],0 +EOS74404,Cc1cc(C)n(C[C@H]2CCCN2Cc2ncc(C(C)(C)C)o2)n1,0 +EOS39466,O=C1Nc2ccc(F)cc2C2(CNC2)O1,0 +EOS100771,Cl.O=C1Nc2ccccc2C2(CCN(CCc3ccc(C(F)(F)F)cc3)CC2)O1,0 +EOS97473,COc1ccnc(NC[C@H](O)c2ccccc2C(F)(F)F)n1,0 +EOS101061,CC1(C)CC[C@]2(C(=O)O)CC[C@]3(C)[C@H](C(=O)C=C4[C@@]3(C)CC[C@H]3C(C)(C)C(=O)C(C#N)=C[C@]43C)[C@@H]2C1,0 +EOS70993,CCCCNc1ccc(S(C)(=O)=O)cc1S(C)(=O)=O,0 +EOS35211,Cc1ccc(C)n1CCC(=O)Nc1ccc2[nH]c(CO)nc2c1,0 +EOS30963,COCCOCc1nc(C2CCCCN2C(=O)Cc2cccc(Cl)c2)no1,0 +EOS6650,CN(CCCS(=O)(=O)N(C)C)Cc1n[nH]c2c1CCCCC2,0 +EOS84489,O=c1nc(C(F)(F)F)nc2ccccn12,0 +EOS85707,CCOCC(=O)N1c2ccccc2N(C)CC[C@@H]1C,0 +EOS68429,COc1cc(C(=O)Nc2ccc(C#N)cc2)cc2c1OCCO2,0 +EOS75116,C[C@H](NCc1cccc(NC(=O)Cn2cccn2)c1)c1cccs1,0 +EOS101549,CCN1CCN(C(=O)Cc2ccc(Nc3ncc(F)c(Nc4ccc(C(=O)Nc5ccccc5Cl)cc4)n3)cc2)CC1,0 +EOS2236,COc1ccc(Nc2nc(N3CCN(S(C)(=O)=O)CC3)nc3ccccc23)cc1,0 +EOS38187,CC(=O)N1CCc2cc(NC(=O)c3cc4ccccc4cc3O)ccc21,0 +EOS60861,Cc1ccc(NC(=O)CCc2ccccc2)cc1S(=O)(=O)N1CCOCC1,0 +EOS71172,Cc1cnn(C[C@H]2CN(Cc3ccoc3C)CCO2)c1,0 +EOS101808,Cl.O=C(O)Cc1cccc(OCCCN(Cc2cccc(C(F)(F)F)c2Cl)CC(c2ccccc2)c2ccccc2)c1,0 +EOS40031,O=S1(=O)CCCN(Cc2cn3ccccc3n2)CC1,0 +EOS73643,COc1cc(CNC(=O)C(C)C)ccc1OC[C@H]1CCCO1,0 +EOS85429,Cl.Cn1cc(CN)c(-c2ccc(C#N)cc2)n1,0 +EOS93043,CNC(=O)c1ccc(CSC(C)(C)C)cc1,0 +EOS102237,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c1CCCC2(C)C,0 +EOS72582,COc1ccc(OC)c([C@@H]2CCCN2C(=O)c2cn(C(C)C)cn2)c1,0 +EOS57426,CCN(CC)c1ccc(C(=O)N2CCC[C@H]2c2noc(C)n2)cn1,0 +EOS53552,CCOc1ccc(S(=O)(=O)Nc2ccc([C@@]3(C)NC(=O)NC3=O)cc2)cc1OCC,0 
+EOS20732,CC(C)Cn1ccc2c(NC(=O)c3cnc4sccn4c3=O)cccc21,0 +EOS51286,Cc1cc(NC(=O)NC[C@H](O)COc2cccc3ccccc23)no1,0 +EOS60822,Cn1c(=O)c2c(ncn2CC(=O)Nc2nc(-c3ccc4c(c3)OCCO4)cs2)n(C)c1=O,0 +EOS19194,COc1ccnc2c1c(=O)n(CC(=O)Nc1ccc3c(c1)OCO3)c(=O)n2C,0 +EOS4613,CC1CCCN1C(=O)c1ccc2nc(-c3ccc(F)cc3)cn2c1,0 +EOS64357,CC(=O)N(C)C1CCN(C(=O)c2ccc3cc[nH]c3c2)CC1,0 +EOS73381,CN1C[C@@H]2[C@@H](CCCN2C(=O)NCc2cccc(COCC(F)(F)F)c2)C1=O,0 +EOS48062,C[C@@H](CC#N)NC(=O)c1ccnc(SC(C)(C)C)c1,0 +EOS101656,Cl.Clc1ccccc1CN1CCc2sccc2C1,0 +EOS101295,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/CN1CCCCC1,0 +EOS86660,N#CCCN(CCC(F)(F)F)C(=O)Cn1nc2n(c1=O)CCCCC2,0 +EOS74770,Cc1noc(C)c1C[C@H](C)C(=O)N1CC(n2cc(-c3ccccc3)nn2)C1,0 +EOS67998,Cc1ccc(C)c(S(=O)(=O)NC(=O)CC[C@H]2CCCO2)c1,0 +EOS968,CCCC(=O)Nc1nn2c(-c3ccc(C)c(F)c3)nnc2s1,0 +EOS38460,COc1cc(OC)cc(N2CC[C@H](NC(=O)N3Cc4cccc(OC)c4C3)C2)c1,0 +EOS97123,O=C(NCCOc1ccccc1F)[C@H]1CC(=O)N(Cc2ccccc2)C1,0 +EOS45917,C[C@H](NC(=O)c1ccoc1)C(=O)Nc1cccc(CN2C(=O)CNC2=O)c1,0 +EOS83217,CCCN(C(=O)c1nn(-c2ccccc2C(F)(F)F)c(C)cc1=O)[C@H]1CCS(=O)(=O)C1,0 +EOS49461,CCN(CCN(C)CC(F)(F)F)C(=O)c1ccccc1-n1cccn1,0 +EOS50218,Cc1n[nH]cc1CN1C(=O)[C@@H]2CCCCN2C1=O,0 +EOS4606,COc1ccccc1-n1cc(C(=O)N2CCCCC2c2cc(C)on2)cn1,0 +EOS61531,CCc1ccc(NC(=O)NC[C@H]2CCS(=O)(=O)C2)cc1,0 +EOS20410,O=C(c1cn(C[C@@H]2CCCN2C2CCOCC2)nn1)N1CCc2ccccc2C1,0 +EOS84305,Cn1nc(N2CCN(c3ccc(F)cc3)CC2)c(=O)n(C)c1=O,0 +EOS41541,CCn1c(=O)n(CC(=O)N2CCCN3c4ccccc4C[C@H]3C2)c2ccccc21,0 +EOS100489,C[C@@H]1COCCN1c1cc(=O)n2c(n1)N(Cc1cncc(Cl)c1)[C@H](C(F)(F)F)CC2,0 +EOS57657,C[C@]1(C(=O)N2CCCc3occc3C2)CCC(=O)NC1,0 +EOS68318,N#C[C@@H](NC(=O)c1ccc2[nH]nnc2c1)C1CCCCC1,0 +EOS70350,CCn1cc(NC(=O)c2cc(-c3cccc(Cl)c3)no2)ccc1=O,0 +EOS101803,CN1CCc2cc3c(cc2[C@H]1[C@@H]1OC(=O)c2c1ccc1c2OCO1)OCO3,0 +EOS61452,O=S(=O)(c1ccc(N2CCN(c3ncnc4c3oc3ccccc34)CC2)nc1)N1CCOCC1,0 +EOS32960,CC1(c2noc(C3CCOCC3)n2)CCCN(C(=O)c2ccccn2)C1,0 +EOS101363,Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1,0 
+EOS100836,CCC(C)n1ncn(-c2ccc(N3CCN(c4ccc(OC[C@H]5CO[C@](Cn6cncn6)(c6ccc(Cl)cc6Cl)O5)cc4)CC3)cc2)c1=O,0 +EOS40062,O=C(CN1CCC[C@H](O)C1)N1CCN(Cc2ccccc2)CC1,0 +EOS53751,CC(C)CCNC(=O)Cn1cnc2c(oc3ccccc32)c1=O,0 +EOS28747,CC(=O)Nc1ccc(S(=O)(=O)Nc2ccc3c(c2)CCC(=O)N3C)cc1,0 +EOS68356,C[C@@H](c1ccco1)N(C)C(=O)c1cc(O)nc(N(C)C)n1,0 +EOS61487,CCCNC(=O)[C@H](C)NC(=O)CCCn1cnc2c(C)cccc2c1=O,0 +EOS74344,O=C(Nc1ccncc1)N[C@H]1CCS(=O)(=O)C1,0 +EOS22069,CCn1nc(N2CCCC(NS(=O)(=O)c3ccc(F)c(C)c3)C2)ccc1=O,0 +EOS75113,COc1ccc(NC(=O)NC[C@H](C)Cn2nc(C)cc2C)cn1,0 +EOS14093,Cc1nc(-c2ccc(S(=O)(=O)N3CCN(c4cc(-n5ccnc5)ncn4)CC3)s2)cs1,0 +EOS84289,CN1CC(=O)N=C1NC(=O)Nc1ccc(O)cc1,0 +EOS12353,CC(C)c1ccccc1NC(=O)Cn1c(=O)oc2ccccc21,0 +EOS48318,CC(C)[C@@H](c1ccccc1)S(=O)(=O)NC(=O)c1cn2c(n1)CCC2,0 +EOS68001,CCONC(=O)c1csc(-c2ccc(OC(C)C)cc2)n1,0 +EOS95884,Cc1nccn1C[C@@H](C)CNC(=O)CCCn1c(=O)oc2ccccc21,0 +EOS18705,CCCN(CCC)CCCNC(=O)CCNC(=O)Cn1ccc2ccccc2c1=O,0 +EOS68358,O=C(Nc1ccccc1Cl)N1CCN(Cc2ccccn2)CC1,0 +EOS70986,O=C([C@@H]1CN(Cc2nnsc2Cl)c2ccccc2O1)N1CCCCC1,0 +EOS655,COc1ccc(OC)c(C2CC(c3c(O)[nH]c(=O)n(C4CCCCC4)c3=O)=NN2)c1,0 +EOS25087,COc1ccc(NC(=O)N2CCN(c3nc4ccccc4n(Cc4ccccc4)c3=O)CC2)c(OC)c1,0 +EOS74565,CCn1nnc(C)c1CS(=O)(=O)CC1(C)CCCCC1,0 +EOS75894,Cc1ccc(C#N)cc1S(=O)(=O)N1CCN[C@@H](C)C1,0 +EOS73711,CNS(=O)(=O)c1ccc(C(=O)Nc2ccc(-n3ccnc3C)c(F)c2)o1,0 +EOS63830,CCc1ccc(C)cc1OCc1nnc(C)n1CC,0 +EOS75530,CCc1ccc(S(=O)(=O)N2CC[C@H](CNC)C2)cc1,0 +EOS102405,C[N+](C)(C)CC#CCN1CCCC1=O.[I-],0 +EOS73193,CCN(CC)S(=O)(=O)c1cccc(C(=O)Nc2cccc(N3CCCC3=O)c2)c1,0 +EOS64470,CCS(=O)(=O)N1CCC(=NO)CC1,0 +EOS100139,Cc1c(CN(C)C(=O)/C=C/C2=CNC3=NC(=O)CCC3=C2)oc2ccccc12,0 +EOS85018,COCc1cc(C(=O)N2Cc3ccccc3N(C)C[C@@H]2C)no1,0 +EOS79004,OC[C@H](Nc1cc(C(F)(F)F)ccn1)c1ccccc1,0 +EOS68364,CCc1ccc([C@H](C)NCC(C)(C)C(N)=O)s1,0 +EOS100502,COc1cc2c(cc1F)C(c1ccccc1Cl)=NC1=C(C)NNC1=N2,0 +EOS11756,Cc1ccc(OCC(=O)N2CCc3ccccc32)cc1C,0 +EOS38930,O=C(CCNC(=O)[C@@H]1COCCN1CC1CCC1)Nc1ccccc1,0 
+EOS2381,Cc1c2c(=O)n(-c3nc4ccccc4s3)[nH]c2cc(=O)n1Cc1cccs1,0 +EOS75463,Cc1ccc(NC(=O)CN(C)S(C)(=O)=O)cc1S(=O)(=O)N1CCOCC1,0 +EOS68402,CC(=O)Nc1ccc(NC(=O)c2cccc(OCc3cscn3)c2)cc1,0 +EOS68555,Cc1cn2c(n1)CC[C@H](NC(=O)c1cc(C)oc1C)C2,0 +EOS30122,CCn1c(=O)c2c(cc(C)n2C)n(CC(=O)Nc2ccc(OC)c(Cl)c2)c1=O,0 +EOS84785,CCONC(=O)c1nn(-c2ccccc2F)cc1O,0 +EOS50836,c1ccc([C@H]2N(c3ncnc4[nH]ccc34)CC23CCOCC3)cc1,0 +EOS69603,O=C(Cn1cnc2scc(-c3cccs3)c2c1=O)Nc1cccnc1,0 +EOS72529,C[C@@H](CN1CCCCC1)NC(=O)Cc1ccn[nH]1,0 +EOS37268,COc1ccc(NC(=O)c2ccc(NC(=O)c3cnn(C)c3)cc2)cc1,0 +EOS100138,CNC(=O)O[C@H]1COc2ccc(N3CCN(C4COC4)CC3)cc2[C@@H]1NC(=O)c1ccc(F)cc1,0 +EOS38167,COc1ccc2nc(C(=O)Nc3ccc4[nH]c(=O)[nH]c4c3)ccc2c1,0 +EOS57195,O=C(Cn1ccc2ccc(Cl)cc21)NC[C@]1(O)CCS(=O)(=O)C1,0 +EOS53366,Clc1ccccc1-c1noc(CSc2nnnn2C[C@H]2CCCO2)n1,0 +EOS69457,Nc1nc(C2CC2)nc(N2CCN(C[C@H]3CCC4(CCC4)O3)CC2)n1,0 +EOS71052,Cc1cc(C)n(C[C@H]2CCCN2C(=O)c2ccoc2C)n1,0 +EOS13687,Cc1ccc2nc(Nc3ccccc3)c(/C=N/O)c(=O)n2c1,0 +EOS71163,Cc1nc(-c2cccc(NC(=O)CN[C@H]3CCC(=O)NC34CCC4)c2)cs1,0 +EOS71861,N#CCCN(CC1CC1)C(=O)[C@H](O)c1ccc(Br)cc1,0 +EOS10022,Cc1c(NC(=O)N[C@H]2COC[C@@H]2N2CCCC2)cccc1N1CCCC1,0 +EOS55390,O=C(Nc1nc2ccccc2[nH]1)c1c[nH]n2c1nc(=O)c1ccccc12,0 +EOS72237,O=C1C[C@H](NC(=O)c2ocnc2C2CC2)CN1,0 +EOS21724,CC(=O)c1ccc(NC(=O)C(CC(C)C)NS(=O)(=O)c2ccc3c(c2)oc(=O)n3C)cc1,0 +EOS38052,CCN(C)C[C@H]1CCN(C(=O)CCNc2ncc(C(F)(F)F)cc2Cl)C1,0 +EOS19865,CCCNC(=O)c1cccc2c1CCN2C(=O)c1ccc(F)cc1,0 +EOS85253,COc1ccccc1O[C@H]1CCCN(C(=O)c2cnc3c(cnn3C)c2OC)C1,0 +EOS70677,CO[C@@](C)(CNC(=O)c1ccc(=O)[nH]n1)C1CC1,0 +EOS101606,CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21,0 +EOS57141,C[C@H](NCCN(C)Cc1ccccc1)c1cn(C2COC2)nn1,0 +EOS31524,Cc1cnc(C)n1C1CCCN(C(=O)c2cccs2)C1,0 +EOS68188,C[C@H]1CN(C(=O)c2cccnc2N2CCOCC2)CCO1,0 +EOS64373,CCCNC(=O)c1ccc(NC(=O)[C@@H]2COCCN2CC2CCC2)cc1Cl,0 +EOS21404,CC(C)Cn1c(=O)c2ccc(C(=O)NC3CCCC3)cc2n2c(=O)[nH]nc12,0 +EOS39492,COc1ccccc1[C@@H]1CC(c2ccco2)=NN1S(C)(=O)=O,0 +EOS39892,O=C(NCCCCc1nccs1)[C@@H]1OCCc2ccccc21,0 
+EOS19808,O=c1c2c[nH]c3ccccc3c-2nn1-c1ccc(Cl)cc1,0 +EOS4028,CNCCN1C(=O)CC[C@H]2CN(C(=O)C3(COC)CCC3)CC[C@H]21.Cl,0 +EOS84786,Cc1nnc2ccc(N3CC[C@]4(CCOC4)C3)nn12,0 +EOS72231,COCCOc1cccc(C(=O)Nc2[nH]cc(C)c2C#N)c1,0 +EOS68564,Cc1nc([C@H]2CCCN(C(=O)COc3ccc(C#N)cc3)C2)no1,0 +EOS74273,O=C(CSc1cccs1)NC[C@H]1CCCO1,0 +EOS78190,CCCN1CCN(Cc2nnc(C)n2CC)c2cc(F)ccc21,0 +EOS11937,CCc1ccc(OCC(=O)Nc2ccc(S(=O)(=O)N(CC)CC)cc2)cc1,0 +EOS46373,CN(C)C(=O)c1cccc(S(=O)(=O)N2CCn3c(Br)cnc3C2)c1,0 +EOS28807,COc1ccc(NC(=O)Cn2nc3c(-c4nc(-c5ccccc5)no4)cccn3c2=O)cc1,0 +EOS92511,COc1ccc(Nc2nc(CN3CCN(S(=O)(=O)c4c(C)noc4C)CC3)cs2)cc1,0 +EOS75292,O=C(Cn1[nH]c(=O)ccc1=O)NC[C@@H](c1ccccc1)N1CCCC1,0 +EOS68622,C[C@H](NC(=O)COc1ccc(N2CCCC2=O)cc1)C(N)=O,0 +EOS93210,Cc1cc(-n2c(C)cc(C(=O)Cn3cnc4c(cnn4C)c3=O)c2C)no1,0 +EOS29869,O=C(Nc1ccc(F)cc1)c1ccc(=O)n(CCN2CCOCC2)c1,0 +EOS64886,CS(=O)(=O)NC[C@H]1Cc2ccccc2O1,1 +EOS49657,CON(C)C(=O)Cn1nc(C(F)(F)F)cc1Br,1 +EOS2134,O=S(=O)(NCc1ccc2c(c1)OCO2)N1CCCC1,1 +EOS55726,N#Cc1c(F)cccc1N1CCN(c2ccc(N)nc2)CC1,1 +EOS94466,CC(=O)c1ccc(S(=O)(=O)Nc2nnc(-c3cccc(C#N)c3)o2)cc1,1 +EOS28957,CCc1ccc(NC(=O)Cn2nc3c(N4CCCCCC4)nccn3c2=O)cc1,1 +EOS2355,Cc1nc(C)n(CC(C)C(=O)NCc2ccccc2)n1,1 +EOS31977,COC[C@@H]1CN(C(=O)c2ccc(C)cn2)C[C@H]1C(N)=O,1 +EOS12981,Cc1cn2c3c(=O)n(Cc4ccccc4)c(=O)n(C)c3nc2n1CCCN(C)C,1 +EOS47263,CCCc1cc(C(=O)N2CCC[C@H](Cn3cncn3)C2)cc(=O)[nH]1,1 +EOS97698,CC(C)n1ccnc1CSCCN(C)C,1 +EOS14145,CCCC(=O)N1CCN(c2ccc(Nc3cccc(C)n3)nn2)CC1,1 +EOS9133,O=C(c1ccc2c(c1)CCCC2)N1C[C@H](O)[C@@H](N2CCOCC2)C1,1 +EOS73842,CC(C)(C)NC(=O)CCN1CCc2[nH]nc(C(F)(F)F)c2C1,1 +EOS28946,CC(=O)c1ccc(NC(=O)Cn2nc3c(Oc4ccccc4C)nccn3c2=O)cc1,1 +EOS36400,Cc1cc(=O)oc2cc(O)c(-c3ccnc(N)n3)cc12,1 +EOS75052,Cc1ccnc(NC(=O)CCNS(=O)(=O)c2cn(C)c(C)n2)c1,1 +EOS101243,Cc1ccc(C(=O)c2cc(O)c(O)c([N+](=O)[O-])c2)cc1,1 +EOS91731,CC(C)O[C@@H]1CCN(C[C@H](O)c2ccc(C(F)(F)F)cc2)C1,1 +EOS20531,COCCCN(Cc1cccn1C)CC(O)COCC(C)C,1 +EOS73088,Cc1ncsc1CCN1CCNC(=O)[C@@H]1c1ccccc1,1 
+EOS101591,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)(C(C)=O)C[C@@H]3O[C@H]1C[C@H](N)[C@H](O)[C@H](C)O1.Cl,1 +EOS55126,COCCc1nnc(SCC(F)(F)F)n1N,1 +EOS23522,Cc1cccn2c(=O)c3cc(C(=O)N4CCCCCC4)n(C)c3nc12,1 +EOS66412,COc1ccc(Cl)cc1NC(=O)CSc1nc(O)cc(=O)n1CC(C)C,1 +EOS30366,O=C(c1cc(CN2CCOCC2)on1)N1CCCC(c2nc(C3CCOCC3)no2)C1,1 +EOS42829,O=C(Nc1cccc(CNC(=O)[C@H]2CCc3nccn3C2)c1)c1ccco1,1 +EOS19470,CCNC(=O)CCc1nc(CS(=O)(=O)c2ccccc2)no1,1 +EOS52119,Cc1nc(CN(C)C(=O)[C@H]2CC(=O)N(CC(F)(F)F)C2)no1,1 +EOS44623,COc1cc(NC(=O)c2csc(-c3ccco3)n2)c(C(N)=O)cc1OC,1 +EOS72244,O=C(CSc1ccncc1)N1CC[C@H](O)C12CCCC2,1 +EOS66281,COCc1ccc(CNC[C@H]2CNc3cc(C)nn3C2)cc1,1 +EOS82921,Cl.N[C@@H]1CCCC[C@H]1NC(=O)c1ccc2c(c1)NC(=O)CO2,1 +EOS11868,CCOc1ccc(Br)cc1S(=O)(=O)N1CCN(c2ccccc2OC)CC1,1 +EOS8750,Cc1cc(NC(=O)C2CCCN(CC(N)=O)C2)ccc1N1CCCC1=O,1 +EOS74107,CC(C)CC(=O)Nc1cc(C2CC2)nn1[C@H]1CCS(=O)(=O)C1,1 +EOS3220,CCN(C(=O)c1ccc(-c2c(C)[nH]c(=O)[nH]c2=O)cc1)[C@H]1COC[C@@H]1O,1 +EOS75496,CCOc1ccc2c(c1)C=C(C(=O)Nc1ccc3c(c1)CCN3C(C)=O)CO2,1 +EOS2177,CC(=O)c1cccc(NC(=O)c2cc(S(=O)(=O)N3CCCC3)cs2)c1,1 +EOS48254,Cc1ccc(C(=O)Nc2cnn(C[C@H]3CCCCN3C(=O)c3ccc(C)n3C)c2)n1C,1 +EOS60252,COc1ccc2oc(=O)c(C(=O)NCc3ccccn3)cc2c1,1 +EOS88181,CC1(C)CCc2sc(NC(=O)N3CCc4ccnc(O)c4C3)nc21,1 +EOS1394,CCOc1ncccc1CN(C)C(=O)c1ccc(Br)o1,1 +EOS55066,Cc1cc(C)cc(N(C)Cc2cn(C)c(=O)n(C)c2=O)c1,1 +EOS101060,Cl.N=C(NCc1ccccc1)NC(=O)c1nc(Cl)c(N)nc1N,1 +EOS41021,O=C(Nc1ccc(OC2COC2)cc1)[C@]1(c2ccccc2)CCCO1,1 +EOS179,COCCCNC(=O)c1ccc(N2CCCCS2(=O)=O)cc1Cl,1 +EOS42600,CS(=O)(=O)c1ccc(C(=O)Nc2ccc(C(=O)NC3CC3)cc2)cc1,1 +EOS30056,CCCN(CC1CCOC1)S(=O)(=O)c1ccc(C(F)(F)F)cc1,1 +EOS71174,CC1(C)COC[C@@H]1NC(=O)c1oc2ccccc2c1Cn1cccn1,1 +EOS78531,Cc1ccc(CN(C)C(=O)c2cccc(C)n2)cc1,1 +EOS43509,O=C1OCCN1c1cccc(C(=O)N(c2cccc(F)c2)C2CCOCC2)c1,1 +EOS29806,COc1ccc(NC(=O)c2c(O)c3cccnc3[nH]c2=O)c(OC)c1,1 +EOS16178,CCC(=O)N1CCc2sc(S(=O)(=O)NCc3ccccc3OC)cc2C1,1 +EOS81233,O=C(c1cnccn1)N1CCC[C@H]1c1nnc2n1CCCCC2,1 +EOS47464,Cc1ccc(C(=O)N[C@@]2(C)CCS(=O)(=O)C2)cc1F,1 
+EOS63147,CO[C@@H](CNc1ccccc1OCC(F)(F)F)C1CCOCC1,1 +EOS47476,CCCCNC(=O)NC(=O)CN1CCC(C(=O)c2ccc3c(c2)OCCO3)CC1,1 +EOS44914,Cc1n[nH]c2ncc(CNC[C@@H](c3cccs3)N3CCCC3)cc12.Cl,1 +EOS83454,CCN(Cc1ccc(Cl)s1)C(=O)Cn1c(=O)n(CC)c(=O)c2ccccc21,1 +EOS101236,CN(C)C[C@@H]1CCn2cc(c3ccccc32)C2=C(C(=O)NC2=O)c2cn(c3ccccc23)CCO1.Cl,1 +EOS30121,CCc1ccc(N2CC(c3noc(-c4ccccn4)n3)CC2=O)cc1,1 +EOS84491,CC(=O)Nc1cccc(NC(=O)c2ccc3nccnc3c2)c1,1 +EOS66886,Cc1noc(C)c1CC(=O)NC[C@@H](c1cccs1)N(C)C,1 +EOS67062,COC1(C(=O)N2CC[C@H](c3nc(C)cs3)C2)CS(=O)(=O)C1,1 +EOS69662,Cc1ccc(-c2nc(C(=O)NCCS(N)(=O)=O)cs2)c(C)c1,1 +EOS19323,CCCNC(=O)c1ccc(N2CCNCC2)c(NS(=O)(=O)c2cccs2)c1.O=C(O)C(F)(F)F,1 +EOS29690,CCc1nnc(-c2ccc(=O)n(CC(=O)Nc3ccc(OC)c(Cl)c3)c2)o1,1 +EOS37142,CN(C(=O)Cn1nnn(-c2cccs2)c1=O)c1nc2ccccc2s1,1 +EOS56625,Cc1ccc2nc(CN3C[C@H](C)OC(C)(C)C3)cc(=O)n2c1,1 +EOS48993,OCCCSc1nnc(COc2ccc(Cl)cc2)[nH]1,1 +EOS66335,O=C(Nc1ccc(-n2cncn2)nc1)c1cnn(-c2ccccc2F)c1,1 +EOS42224,COc1ccccc1-n1ncc(C(=O)Nc2cccc3nccn23)c1C,1 +EOS17576,COc1cc(NC(=O)Cn2cnc3c([nH]c4cc(OC)c(OC)cc43)c2=O)cc(OC)c1,1 +EOS35103,CS(=O)(=O)N(CC(=O)NC1CC2CCC1C2)c1cccc(F)c1,1 +EOS68306,Cn1ccnc1C[C@H]1CCCN(Cc2cc(C#N)cs2)C1,1 +EOS75255,O=C(Nc1nccs1)c1cc2ccccc2[nH]c1=O,1 +EOS29172,CCC(=O)Nc1ccc(-n2cnc(C(=O)N3CCN(c4ccc(F)cc4)CC3)c2)nc1,1 +EOS24746,CC(Oc1ccccc1)C(=O)N1CCN(S(=O)(=O)c2cc3c(cc2Cl)NC(=O)CO3)CC1,1 +EOS60479,COc1ccc(S(=O)(=O)Nc2ccc(C)c(S(=O)(=O)N(C)C)c2)cc1,1 +EOS6670,CCc1oc(C(=O)N2CCOC(c3ccccc3)C2)cc1CN1CCCC1,1 +EOS60213,CC(=O)NCc1ccc(C(=O)CN2C(=O)N[C@@](C)(Cc3ccc4c(c3)OCO4)C2=O)s1,1 +EOS29060,CC(=O)N1CCc2cc(N3CC(C(=O)Nc4ccc(F)cc4)CC3=O)ccc21,1 +EOS20201,Cc1cnc(C2CC2)n1CCNC(=O)C(C)Oc1ccccc1,1 +EOS55933,COc1cc(NC(=O)c2ccc(C#N)c(C)n2)cc(-n2cccn2)c1,1 +EOS34900,COc1ccc(OCC(O)Cn2c(Nc3ccccc3)nc3c2c(=O)[nH]c(=O)n3C)cc1,1 +EOS49292,Cc1ccc(CC(=O)N2CCC[C@H](n3ccnc3C)C2)s1,1 +EOS83010,Cc1cccc(NC(=O)CCc2c(C)nc(C)[nH]c2=O)c1,1 +EOS92560,CC(=O)Nc1ccccc1OCC(=O)N1N=C(c2ccco2)C[C@H]1c1ccco1,1 +EOS69831,CC[C@H](Sc1ccccc1)C(=O)NCc1cc(=O)nc(SC)[nH]1,1 
+EOS82679,N#Cc1ccnc(N2CCN(CC(=O)NC(N)=O)CC2)c1,1 +EOS32067,O=C(Cc1cccc(F)c1)N1CCC(n2cc(COCC3CC3)nn2)C1,1 +EOS53528,O=C1COc2cc(C(=O)Nc3ccc(Cl)cc3)ccc2N1,1 +EOS70163,CCCCN1CCCC[C@@H]1CNC(=O)Nc1ccc(-n2cncn2)nc1,1 +EOS97090,COc1ccccc1NC(=O)COc1ccc(C(=O)N(C)C[C@H]2COc3ccccc3O2)cc1OC,1 +EOS97137,COc1ccc(C(C)(C)C)cc1NC(=O)Cn1cnc2c1c(=O)n(C)c(=O)n2C,1 +EOS46355,Cc1ccc2c(c1)[C@H]1CN(C)CC[C@@H]1N2S(=O)(=O)c1ccc(C)c(N(C)C)c1,1 +EOS85878,Cc1cccc(N(C)C(=O)c2cc(Br)c(C)n(C)c2=O)c1,1 +EOS86566,Cc1cc(NC(=O)CNc2ccccc2C(=O)NCCc2ccccc2)no1,1 +EOS88849,CC(=O)Nc1ccccc1CNc1cc(C(F)(F)F)n(C)n1,1 +EOS58411,O=C([C@@H]1CCCCN1C(=O)c1ccco1)N1CC(O)(C(F)F)C1,1 +EOS68759,Cc1cc(C)n2nc(C(=O)Nc3ccc(Br)cc3)nc2n1,1 +EOS21738,CN(CC(=O)N1CCN(c2ccc(F)cc2)CC1)S(=O)(=O)c1ccc2c(c1)c(=O)n(C)c(=O)n2C,1 +EOS33555,O=C(NCCN1CCOCC1)C1CCc2sc(C(=O)N3CCCCC3)cc2C1,1 +EOS45003,CC(C)NC(=O)Cn1ncnc1-c1ccccc1Cl,1 +EOS81781,Cc1ccc(C(=O)C2CCN(c3cc(=O)n4ccccc4n3)CC2)cc1,1 +EOS58118,COc1cnc(C2(NC(=O)C[C@H](C)n3cccn3)CCCC2)[nH]c1=O,1 +EOS92643,COc1ccc(NC(=O)COc2ccc3c(C)cc(=O)oc3c2)c(OC)c1,1 +EOS20358,O=C(c1cncc(Br)c1)N1CCCC1CCc1noc(C2CC2)n1,1 +EOS1448,COc1cccc(C(=O)N2CCC(N(C(C)=O)C3CCOCC3)C2)c1C,1 +EOS40322,Cc1cccc(-c2ccc(C(=O)N(C)Cc3cnn(C)c3)c(=O)[nH]2)c1,1 +EOS53159,CCn1c(SCC(=O)N2CCCCC2)nc2cc(S(=O)(=O)N3CCCCC3)ccc21,1 +EOS97722,Fc1cccc(Cn2ccnc2-c2cccnc2)c1,1 +EOS29173,CCC(=O)Nc1ccc(-n2cnc(C(=O)N3CCN(c4ccccc4OC)CC3)c2)nc1,1 +EOS12054,CC(C)N1CC(C(=O)Nc2ccc(Br)c(Cl)c2)CC1=O,1 +EOS85246,O=C(Cc1n[nH]c2ccccc12)N1CCOCC(F)(F)C1,1 +EOS55119,CCn1ncc2c1CCC[C@@H]2NCc1ccsc1,1 +EOS97541,Cc1c(C(=O)N2CCC3(C2)OCCO3)cnn1CCO[C@@H]1CCCCO1,1 +EOS32335,CC(=O)N1CCN(C(=O)c2cc(C)n(C)n2)CCC1c1nc(C)no1,1 +EOS100737,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)[C@@H](C)NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)O,1 +EOS97418,CC(C)Oc1cccc(N2CCN(C(C)C)CC2)n1,1 +EOS66465,CNC(=O)c1ccc(NC(=O)N2CCC[C@H](c3nc(C)no3)C2)cc1,1 +EOS30502,CN(CCNC(=O)C1CCCCC(=O)N1Cc1ccccc1)C1CCCCC1,1 +EOS50937,CCOc1ccccc1N1CCN(C(=O)c2ccc(OC)c(OC)c2OC)CC1,1 
+EOS73036,CC[C@@H](CNC(=O)c1ccncc1)N1CCCC1,1 +EOS72932,CCN(C[C@H]1CCOC1)C(=O)NCc1ccnc(N2CCOCC2)c1,1 +EOS20126,O=C(CCc1ccc2c(c1)CN(S(=O)(=O)c1cccc(F)c1)CCO2)N1CCCC1,1 +EOS36759,CCOc1ccc(C2C(C(=O)c3ccc4c(c3)OCCO4)=C(O)C(=O)N2CCCN2CCOCC2)cc1,1 +EOS86605,Cc1ccccc1-c1cnc(NC(=O)c2cnn(C)c2)s1,1 +EOS17540,CCOc1ccc(N2CCn3c(SCC(=O)Nc4cccc(NC(C)=O)c4)nnc32)cc1,1 +EOS21408,CCCCn1c(=O)c2sccc2n2c(=O)n(CC(=O)Nc3cc(OC)c(OC)c(OC)c3)nc12,1 +EOS94425,Cn1nc(-c2ccco2)cc1NC(=O)c1ccc(OCC(F)F)nc1,1 +EOS44088,Cc1ccc2ncc(CN(CC(=O)N(C)C)CC(F)(F)F)n2c1,1 +EOS22776,CCC(=O)N1N=C(c2c(O)[nH]c(=O)n(C)c2=O)CC1c1ccc(OC)cc1,1 +EOS19299,O=C(Cn1nc2ccc(Sc3cccc(F)c3)nn2c1=O)NC1CCCC1,1 +EOS36645,Nn1c(Nc2ccc(Nc3ccccc3)cc2)nncc1=O,1 +EOS45188,CN1CCN(S(=O)(=O)c2ccc(NC(=O)c3ccccn3)cc2)CC1,1 +EOS50953,O=C(c1ccc(-c2ccco2)[nH]c1=O)N1CCCN(Cc2ccc(Cl)cc2)CC1,1 +EOS28966,CC(=O)c1cccc(NC(=O)Cn2nc3c(N4CCN(c5ccccc5)CC4)nccn3c2=O)c1,1 +EOS26888,CCOc1ccc(NC(=O)Cn2nc(-c3ccncc3)ccc2=O)cc1,1 +EOS61477,COc1cc(C(=O)N[C@H](C(=O)NCc2ccc3c(c2)OCO3)C(C)C)cc(OC)c1OC,1 +EOS97296,O=C(Cc1n[nH]c(=O)c2ccccc12)Nc1cccc(Br)c1,1 +EOS17737,COCc1nnc2n1CCC(NC(=O)CCc1ccc(OC)cc1)CC2,1 +EOS20359,Cc1noc(C)c1CCC(=O)N1CCCC1CCc1noc(-c2ccccn2)n1,1 +EOS13504,COC(CNC(=O)Cn1cc(NC(=O)CCOc2ccccc2)cn1)c1cccs1,1 +EOS11683,CN(c1ccc(C(=O)Nc2ccc(S(=O)(=O)N3CCOCC3)cc2)cc1)S(=O)(=O)c1ccccc1,1 +EOS84909,CC[C@@](C)(CCCNC(=O)Nc1ccc(C(=O)N2CCCC2)cc1)C1OCCO1,1 +EOS35148,O=C(CCNC(=O)c1cnccn1)Nc1ncc[nH]1,1 +EOS29151,Cc1nc(-c2ccccn2)cc2nn(CC(=O)Nc3ccc(C#N)cc3)c(=O)n12,1 +EOS96021,CCn1c(=O)c(C)nc2cc(C(=O)N3CCN(C(=O)[C@@H]4COc5ccccc5O4)CC3)ccc21,1 +EOS29084,CCC(=O)N1CCc2cc(N3CC(C(=O)Nc4ccc(C)cc4)CC3=O)ccc21,1 +EOS90326,Cn1ccnc1C(=O)c1ccc(NC(=O)CCN2C(=O)COc3ccccc32)cc1,1 +EOS27652,CCC(C)NC(=O)CSc1nc2ccccc2c(=O)n1CCCC(=O)NCC1CCCO1,1 +EOS46701,CCn1nc(C)c(C(=O)NC[C@@H](CO)c2cccnc2)n1,1 +EOS54876,NC(=O)c1cc(-c2csc(Nc3ccccn3)n2)ccc1O,1 +EOS39531,CCC(CC)NC(=O)C1CCN(c2ccc3nnc(C(F)(F)F)n3n2)CC1,1 +EOS92675,O=C(CSCc1cc(=O)n2ccsc2n1)N1CCC[C@H]1c1ccc2c(c1)OCCCO2,1 
+EOS55454,CC(C)c1nc([C@H](C)NC(=O)c2c[nH]c(=O)c3ccccc23)cs1,1 +EOS75192,Fc1ccc(Oc2ccc(CNC[C@H]3CNc4ccnn4C3)cn2)cc1,1 +EOS284,O=C(Nc1ccc2c(c1)N(C(=O)c1cccnc1)CCC2)C1CC1,1 +EOS95426,O=C(N[C@H]1CCc2[nH]ncc2C1)N1CCSC2(CCCCC2)C1,1 +EOS84730,CNC(=O)c1ccc(CNC(=O)Cc2cn(C)c3ccccc23)cc1,1 +EOS94432,O=C(COc1ccccc1Cc1ccccc1)N1CCCS1(=O)=O,1 +EOS28936,COc1cccc(Oc2nccn3c(=O)n(CC(=O)NCc4ccc5c(c4)OCO5)nc23)c1,1 +EOS12051,CC(=O)Nc1ccc(NC(=O)CSc2nnc(-c3ccc(N)cc3)n2C)cc1,1 +EOS37291,CNC(=O)[C@@H]1CN(CC(=O)Nc2ccc(C(C)=O)cc2)c2ccccc2O1,1 +EOS50339,Cc1nc([C@H]2CCCN(C(=O)C3=COCCC3)C2)no1,1 +EOS13320,CCNC(=O)c1ccc(C)c(NS(=O)(=O)c2ccccc2)c1,1 +EOS11266,Cc1ccc(NC(=O)CN2C(=O)c3ccccc3C2=O)c(C)c1,1 +EOS87753,CC(=O)c1ccc(NS(=O)(=O)c2ccc3c(c2)OCCCO3)cc1,1 +EOS85014,C[C@@H](CC(=O)Nc1cccc(N2CCCC2)c1)NC(=O)c1cnc2n1CCCC2,1 +EOS37592,O=C(Nc1cccc2ccncc12)c1cnn2ncccc12,1 +EOS19978,CCCc1ccc(S(=O)(=O)N2CCN(c3cc(-c4ccncc4)n[nH]3)CC2)cc1,1 +EOS62269,Cc1nn(C)c(OCC(F)(F)F)c1CN1CC[C@H](CNc2ccccn2)C1,1 +EOS60649,NC1=C(c2nc3ccccc3s2)C(=O)CN1c1ccc2cn[nH]c2c1,1 +EOS102444,CC(C)CCOc1ccc(NC(=S)Nc2ccc(OCCC(C)C)cc2)cc1,1 +EOS49316,Cc1oc(-c2ccco2)nc1CC(=O)N1CCC[C@@H]1Cn1cccn1,1 +EOS16063,O=C(NCc1cccnc1)c1cccc(CN2C(=O)C3CCCN3C(=O)c3ccccc32)c1,1 +EOS19649,COc1cccc(NC(=O)Cn2c3ccccc3n3c(=O)cc(-c4ccncc4)nc23)c1,1 +EOS53775,O=C(Nc1nc2cc3c(cc2s1)OCCO3)[C@H]1CCCN(C(=O)c2ccoc2)C1,1 +EOS53337,O=C(Nc1cccc(NC(=O)c2ccco2)c1)c1ccc2[nH]cnc2c1,1 +EOS19546,O=C(Nc1ccccc1)C1CCN(S(=O)(=O)c2cccc3nonc23)CC1,1 +EOS28535,COc1ccc(CNC(=O)Cn2c(=O)c(CCC(=O)NCC(C)C)nc3ccccc32)cc1,1 +EOS71857,Cn1cc(C(=O)N(C[C@H](O)c2ccc(F)cc2)C2CC2)ccc1=O,1 +EOS14091,Cc1nc(N2CCN(C(=O)Nc3cc(F)cc(F)c3)CC2)cc(-n2cccn2)n1,1 +EOS95991,Cn1cnnc1[C@H]1CCCN(C(=O)c2cc(Cl)c(Cl)n2C)C1,1 +EOS28448,CNC(=O)C1CCN(c2nc3ncn(CC(=O)Nc4cccc(C(C)=O)c4)c(=O)c3s2)CC1,1 +EOS92429,N#CCN1CCCC[C@H]1c1nc(-c2ccccc2)no1,1 +EOS54958,CCCOc1ccc(S(=O)(=O)NC[C@H](C)N2CCOCC2)cc1,1 +EOS22640,Cc1ccc(CS(=O)(=O)N2CCC(C(=O)Nc3ccc4c(c3)n(C)c(=O)n4C)CC2)cc1,1 
+EOS30237,CCn1ncc(N2CCCC(CC(=O)NCc3ccc(C)cc3)C2)cc1=O,1 +EOS13810,CCN(Cc1ccccc1)S(=O)(=O)c1ccc(N2C(=O)CCC2=O)cc1,1 +EOS26448,Cn1cc(S(=O)(=O)N2CCCCC2)cc1C(=O)Nc1ccc2c(c1)OCCO2,1 +EOS36845,CCn1c(=O)c(C(=O)Nc2nc3ccccc3[nH]2)c(O)c2ccccc21,1 +EOS29071,CC(=O)Nc1ccc(NC(=O)C2CC(=O)N(c3ccc4c(c3)CCN4C(C)=O)C2)cc1,1 +EOS73388,CC(=O)N1CCC(NC(=O)c2ccc(Oc3cccc(F)c3)cn2)CC1,1 +EOS97377,C[C@H](NC(=O)CN1C(=O)N(C)C2(CCCCC2)C1=O)c1cc2ccccc2o1,1 +EOS42421,CC[C@H]1c2ccsc2CCN1C(=O)CN1C(=O)CC2(CCCC2)C1=O,1 +EOS59377,Cc1cccc(NC(=O)N2CCN(Cc3ccccc3)[C@H](C#N)C2)n1,1 +EOS12962,O=S(=O)(Nc1cccc2cccnc12)c1ccc2c(c1)OCCO2,1 +EOS29798,Cc1ccccc1-c1noc(-c2c(O)c3ccccc3n(C)c2=O)n1,1 +EOS52786,COCc1cccc(NC(=O)N[C@H]2CCCc3c2cnn3CCO)c1,1 +EOS47267,O=C(CCc1ccc(=O)[nH]c1)N1CCC[C@H]1COc1ccc(F)cc1,1 +EOS81657,O=C(c1ccoc1)N1CCN(C[C@H](O)COc2ccc(F)cc2)CC1,1 +EOS12695,CC1CC(C)CN(CC(=O)Nc2cccc(C(F)(F)F)c2)C1,1 +EOS70223,COc1ccc(CN(C)C(=O)c2coc(Br)c2)cc1O,1 +EOS73405,N#Cc1ccc(NC2CCN(C(=O)[C@@H]3CCCN3S(=O)(=O)c3cccs3)CC2)nc1,1 +EOS84484,O=C(NCC1CC1)c1cc2c(cc1Cl)N1CCCCCC1=NS2(=O)=O,1 +EOS66976,O=C(Nc1ccc(-n2cccn2)c(F)c1)N1CCC(NS(=O)(=O)c2cccs2)CC1,1 +EOS100948,N#Cc1c(NC(=O)c2cccc3ccccc23)sc2c1CCCC2,1 +EOS28571,CCn1c(=O)c(N2CCC(NC(=O)Nc3ccccc3)CC2)nc2cccnc21,1 +EOS28959,COc1cccc(NC(=O)Cn2nc3c(N4CCCCCC4)nccn3c2=O)c1,1 +EOS26607,Cc1nn(C(=O)c2ccc(F)cc2)c(C)c1S(=O)(=O)N1CCCC1,1 +EOS13838,CCCOc1ccc(NC(=O)CSc2nc(=O)cc[nH]2)cc1,1 +EOS11149,COc1ccc(C(CC(=O)N(C)C)c2ccccc2)cc1,1 +EOS70092,O=C(Nc1ccccc1F)N1CCC[C@H](Cn2cncn2)C1,1 +EOS70130,CC(C)Cn1ncnc1CN1CCN([C@@H](C)c2ccsc2)CC1,1 +EOS14953,O=C1CC(c2cccs2)c2cc3c(cc2N1)OCO3,1 +EOS67904,CC(=O)Nc1ccc(C(=O)NC[C@@H](c2ccc(C)o2)N2CCOCC2)cc1,1 +EOS50359,C#CCN1CCC(C(=O)N2CCN([C@@H](C)c3cccnc3)CC2)CC1,1 +EOS86721,COCCCNC(=O)CN(C)C(=O)c1[nH]c(C)c(C(C)=O)c1C,1 +EOS101924,COc1cc(NC(=O)Cc2nn(C)c(=O)c3ccccc23)ccc1Cl,1 +EOS74603,CCC1(CC)CN(C(=O)c2nc(C)c(C)[nH]c2=O)CC[S@@]1=O,1 +EOS35381,COc1ccc(N2C(=O)C(C)CS2(=O)=O)cc1S(=O)(=O)N(Cc1ccccc1)C(C)C,1 
+EOS17665,CC(=O)Nc1ccc(CC(=O)N2CCC(CCOc3ccccc3F)C2)cc1,1 +EOS79945,COc1ccc(-c2cc(C(=O)Nc3ccnc4ccnn34)[nH]n2)cc1,1 +EOS68369,Cc1ccccc1NC(=O)C[C@H](C)Nc1ccc2c(c1)C(=O)NC2,1 +EOS44134,CCOc1ccc([C@H](C)NC(=O)c2cn3nc(C)cc3[nH]c2=O)cc1OC,1 +EOS71441,NC(=O)[C@H]1CCCN(C(=O)NCCc2cccs2)C1,1 +EOS32679,Cc1cncc(C(=O)N2CC(N3CC(F)(F)C[C@H]3CN(C)C)C2)c1,1 +EOS28890,CCN1C(=O)COc2cc(-c3noc(C(=O)NCc4cccnc4)n3)ccc21,1 +EOS18150,CCC(=O)N1CC(c2nc(-c3ccc(F)cc3)no2)C2(CCOCC2)C1,1 +EOS89428,COc1cc(S(N)(=O)=O)ccc1N,1 +EOS56951,CNC(=O)NCC(=O)N1CC[C@H](C)N(Cc2ccccc2)CC1,1 +EOS40201,Cc1nc(C)n(C[C@H]2CCCN(Cc3cscn3)C2)n1,1 +EOS61516,CC(C)c1ccc(NC(=O)NC[C@H]2CCS(=O)(=O)C2)cc1,1 +EOS51419,CC(=O)Nc1cc(C(=O)NC[C@H](C)N2CCc3ccccc32)ccc1F,1 +EOS98162,CCCNC(=O)N1CCCC1c1cc(C)no1,1 +EOS62782,Cc1ccccc1NC(=O)CCc1ccc2c(c1)OCCO2,1 +EOS21348,COc1ccccc1NS(=O)(=O)c1ccc(OC)c(OC)c1,1 +EOS13016,CCOCCn1c(=O)c2c(nc3n(-c4ccccc4)c(C)c(C)n23)n(C)c1=O,1 +EOS60820,CC[C@H](NC(=O)Cn1nc(C)c(S(=O)(=O)N(C)C)c1C)c1ccc(C)cc1,1 +EOS1376,CCc1nnc(-c2nnc(NCCCN3CCN(C)CC3)c3ccccc23)o1,1 +EOS13754,COc1ccc(C(=O)Nc2nnc(-c3ccncc3)s2)c(OC)c1,1 +EOS85074,c1cnnc(N2CCC[C@@H]2CNCc2ccsc2)c1,1 +EOS97700,CS(=O)(=O)CCCONC1CCCC1,1 +EOS91906,O=C1CN(NC(=O)C[C@@](O)(c2nc3ccccc3s2)C(F)(F)F)C(=O)N1,1 +EOS80917,CC(C)(C)NC(=O)NC(=O)CN1CCN(c2ccccc2O)CC1,1 +EOS94024,O=C(c1cn(-c2ccccc2)nc1-c1cccnc1)N1CCN(C(=O)N2CCCC2)CC1,1 +EOS29141,COc1ccc(-c2cc3nn(CC(=O)NCc4ccc(F)cc4)c(=O)n3c(C)n2)cc1,1 +EOS58528,Cc1nccn1[C@H]1CCCN(C(=O)c2cccn2Cc2cccnc2)C1,2 +EOS17544,O=C(NCc1ccco1)c1ccc(-n2nnc3cccnc32)cc1,2 +EOS48187,CC(C)(CO)CSc1nnc2sc3ccccc3n12,2 +EOS8378,CCOc1ncccc1C(=O)N1CC(=O)N(c2ccc(C)cc2)CC1C,2 +EOS72263,COc1cc(C)ccc1OCCC(=O)N1CC[C@@H](N)C1.Cl,2 +EOS32876,O=C(CC1COC2(C1)CN(C(=O)CCc1nc(-c3ccccc3)no1)C2)NCC1CC1,2 +EOS56337,CCCn1nc(C(=O)N[C@@H](C)Cn2cccn2)ccc1=O,2 +EOS69439,CCn1nc(C2CC2)cc1C(=O)N1C[C@H](C)NC(=O)c2ccccc21,2 +EOS93462,Cc1sc2nc3ccc(N4CCCCC4)nn3c(=O)c2c1C,2 +EOS35123,COc1ccc(-c2ccc(=O)n(CN3CCN(Cc4ccccc4)CC3)n2)cc1OC,2 
+EOS32554,COc1cc(OC)cc(C(=O)N2CCCC(c3nc(C)ncc3CO)C2)c1,2 +EOS74345,CON(C)C(=O)c1cnc(-c2ccc(C)cc2C)s1,2 +EOS68601,O=c1c2cc(Cl)ccc2ncn1C[C@H](O)COCc1ccccc1F,2 +EOS35932,COc1ccc2nc(NC(=O)COc3ccc(C(=O)N4CCOCC4)cc3)sc2c1,2 +EOS14492,Cc1ccn(-c2ccc(N3CCN(C(=O)Nc4cccc(F)c4)CC3)nn2)n1,2 +EOS69256,C[C@@H](C1CC1)n1cc(NC(=O)c2ccc3cc[nH]c3c2)cn1,2 +EOS5127,CCCC(=O)N1CCC(c2nnc(Cn3cncn3)n2C2CC2)CC1,2 +EOS80285,COc1ccccc1CN(C(=O)[C@H]1CCC(=O)NC1)C1CC1,2 +EOS96397,CC(C)[C@H](CNc1cnn(C)c(=O)c1Cl)Nc1ccccc1,2 +EOS8570,Cc1cccc2nc(C(=O)Nc3ccc(Oc4nnn[nH]4)cc3)cn12,2 +EOS30112,CCc1ccc(S(=O)(=O)N2CCC(N(C(=O)COC)C3CCOCC3)CC2)cc1,2 +EOS5291,CCN1CC(C(=O)N2CC(Oc3cccc(C)c3)C2)CC1=O,2 +EOS69488,C[C@H]1CCN(C(=O)c2cc3[nH]cnc3cc2F)CCN1Cc1ccccc1,2 +EOS18037,Cc1n[nH]c([C@@]23CCN(C(=O)Cc4c(C)noc4C)C[C@@H]2CN(C(=O)C(C)C)C3)n1,2 +EOS75443,C[C@]1(NC(=O)Cc2ccccc2Cl)CCS(=O)(=O)C1,2 +EOS61200,CC[C@H](C)C(=O)NCc1ccc(OCc2cccnc2)c(OC)c1,2 +EOS94761,O=C(Nc1ccc(-n2ccnc2)cc1)c1ccccc1,2 +EOS34219,O=C(Nc1ccc(Cl)cc1)C1CCCO1,2 +EOS65077,Cc1ccc(N2C[C@@H](C(=O)NCc3nc(N(C)C)no3)CC2=O)cc1,2 +EOS50967,Cc1cc(C)n(C[C@H]2CCCN2C(=O)c2ccc[nH]2)n1,2 +EOS5197,CC(C)c1noc([C@@H]2CCCN2C(=O)c2cn(C)c(=O)[nH]c2=O)n1,2 +EOS59542,COc1ccc(-c2nn(-c3ccccc3)cc2C(=O)N(C)OC)cc1OC,2 +EOS1006,CCC1CCCCN1C1=C(C(=O)N2CCOCC2)S(=O)(=O)c2ccccc21,2 +EOS45220,O=C(NCCN1CCCS1(=O)=O)c1ccc(-c2ccccc2)[nH]1,2 +EOS788,COc1cccc(NS(=O)(=O)c2c(C(=O)N3CCC(C)CC3)c(C)n(C)c2C)c1,2 +EOS20949,COc1cc(C(=O)N2CCN(Cc3ccc(SC)cc3)CC2)cc(OC)c1OC.O=C(O)C(=O)O,2 +EOS86130,O=C([C@@H]1CC=CCC1)N1CCN(S(=O)(=O)c2ccc(F)cc2)CC1,2 +EOS30812,COc1cccc(CNC(=O)C2CCN(c3cnn(Cc4ccccc4)c(=O)c3)C2)c1,2 +EOS6882,Cn1ccnc1C(NC(=O)N1CCOc2ccc(Cl)cc2C1)C1CC1,2 +EOS69228,CNC(=O)c1ccc(C)c(NC(=O)c2ccnn2CC(F)F)c1,2 +EOS27463,CCn1c(SCC(N)=O)nnc1-c1ccc2c(c1)OCCCO2,2 +EOS84589,O=C(Nc1ccccc1C(=O)NCc1cn2ccccc2n1)c1ccco1,2 +EOS81815,CC(C)Oc1ccc(Cl)c(C(=O)N2CCC[C@H]2c2cnn(C)c2)n1,2 +EOS33197,CN(C)S(=O)(=O)N(CCN1CCCC1)C1CCN(S(=O)(=O)c2ccc(F)cc2)C1,2 +EOS83172,CC(C)(C)NS(=O)(=O)c1cccc(NC(=O)c2ccc3c(c2)OCCO3)c1,2 
+EOS84067,Cn1ncc(C2CC2)c1C(=O)Nc1ccc2c(c1)CC(=O)N2,2 +EOS80165,COc1cc(C#N)ccc1OC[C@@H](O)CN1CCC(C)CC1,2 +EOS35519,CCC(C)n1c(C)c(C)n2c3c(=O)n(C)c(=O)n(C)c3nc12,2 +EOS83696,CCc1cccc(NC(=O)CN2CCO[C@@H](C)C2)c1,2 +EOS69169,Cc1cc(NC(=O)c2cn3nccc3nc2C)ccc1OC(C)C,2 +EOS18880,Cc1c(CC(=O)N2CCOCC2)c(=O)oc2c(C)c(O)ccc12,2 +EOS87417,CNC(=O)[C@H](C)NC(=O)Cc1c(C)nc(-c2ccccn2)[nH]c1=O,2 +EOS80106,O=S(=O)([C@H]1CCCN(c2ccnc(C3CC3)n2)C1)N1CCOCC1,2 +EOS49715,CC(C)(C)n1ncn(CCc2ccc3c(c2)CCO3)c1=O,2 +EOS49811,Cn1cc(N2CCN(C(=O)[C@H]3CCc4cn[nH]c4C3)CC2=O)cn1,2 +EOS16480,Cc1cc(C)n(Cc2ccc(C(=O)N3C[C@@H](Oc4cccnc4)C[C@H]3C(=O)NC3CC3)o2)n1,2 +EOS35342,CN(C(=O)C1=C(c2ccccc2)SCCO1)c1ccccc1,2 +EOS7629,COCCn1cnnc1C(C)NC(=O)c1cc(C)oc1C,2 +EOS64203,CC(C)OCc1ccc(C(=O)Nc2ccccc2N2CCOCC2)cc1,2 +EOS56513,COc1ccc(OCC#CCNCc2ccco2)cc1.Cl,2 +EOS42248,Cc1c(S(=O)(=O)N2CCC[C@H](OCC3CC3)C2)cnn1C,2 +EOS48181,Cc1cc(C2(C(=O)NC[C@H](C)Oc3cccc(F)c3)CC2)on1,2 +EOS64099,Cc1cc(=O)n(CC(=O)Nc2ccc(F)cc2)c(SCC(N)=O)c1C#N,2 +EOS16091,Cc1ccccc1Sc1nccnc1SCC(=O)NCCN(C)C,2 +EOS18272,CCN(CC)C(=O)c1ccc(C)c(NC(=O)c2ccccc2)c1,2 +EOS15145,CCCc1cc(C(=O)Nc2cccc(C(C)=O)c2)no1,2 +EOS48819,C[C@@H]1CN(c2c(C#N)nnc3ccccc23)CCN(C2CC2)C1,2 +EOS26757,CC(=O)Nc1ccc(OCc2cc(=O)n3ccc(C)cc3n2)cc1,2 +EOS28926,Cc1nn(Cc2ccc(C(=O)NCC3CCCO3)cc2)c(C)c1S(=O)(=O)N1CCCCC1,2 +EOS78164,COc1c([C@@H]2CCCN2C(=O)Cn2cc(C)cn2)c(C)nn1C,2 +EOS52887,COc1ccccc1NC(=O)N1CCC[C@H](CN2CCC(C)CC2)C1,2 +EOS43076,CC(=O)Nc1ccc(SCC(=O)N(C)[C@H]2CCS(=O)(=O)C2)cc1,2 +EOS54533,CC(=O)[C@H](C)n1c(=O)c2c(nc3n(-c4ccc(F)cc4)c(C)cn23)n(C)c1=O,2 +EOS91708,CC(C)(C)c1ncc(CNc2ncccc2CO)s1,2 +EOS47741,O=C(NCC1=CCNCC1)[C@@H]1COc2ccccc2C1,2 +EOS82946,O=C(c1cn(-c2ccc(F)cc2)nn1)N1CCN(c2ccccc2O)CC1,2 +EOS46735,COc1ccc(C(=O)Nc2cccc(C(=O)N(C)C)c2)cc1,2 +EOS78495,Cc1nccn1[C@H]1CCCN(C(=O)CCc2cnn(C)c2)C1,2 +EOS43752,CC(C)(C)c1ccc([C@@]2(C)NC(=O)N(CN(CC#N)C3CC3)C2=O)cc1,2 +EOS43601,COc1ccc(-n2c(SCCCO)nc3ccccc3c2=O)cc1OC,2 +EOS91781,CC(=O)CCCCCNC(=O)Cc1csc(-c2ccccn2)n1,2 
+EOS32539,COCCOCc1cnc(C)nc1C1CCCN(C(=O)c2cccnc2)C1,2 +EOS74564,O=C(Nc1cccc2c1OCO2)N1[C@H]2CC[C@@H]1C[C@H](c1ccccc1)C2,2 +EOS46025,CN(c1ccccc1)S(=O)(=O)c1cccc(NC(=O)CN2C(=O)CCOc3ccccc32)c1,2 +EOS60658,COc1ccccc1CN(C)c1ccc(S(=O)(=O)Nc2ccccn2)cn1,2 +EOS101423,C/C(=C\c1ccc(C(=O)O)cc1)c1ccc2c(c1)C(C)(C)CCC2(C)C,2 +EOS33504,COc1ccc(C(=O)Nc2ccc(N3CCN(c4ccccc4)C3=O)nc2)cn1,2 +EOS13531,O=C(Cc1noc2ccccc12)NC1CCCN(Cc2cccnc2)C1,2 +EOS21395,CCn1nc(C)c2c1c(=O)n(Cc1ccc(OC)cc1)c(=O)n2CC(=O)N1CCCCC1,2 +EOS90312,O=C(C[C@@H]1SC(N2CCOCC2)=NC1=O)Nc1ccccc1,2 +EOS2445,Cc1cc(C)n(-c2cc(N3CCN(C(=O)COc4ccccc4)CC3)ncn2)n1,2 +EOS19142,Cc1cc(NC(=O)Nc2cn(C)c(=O)c3ccccc23)no1,2 +EOS22595,Cc1ccc(NC(=O)C2CCN(S(=O)(=O)Cc3ccccc3)CC2)cc1,2 +EOS82858,C[C@@H](CNS(=O)(=O)C1CN(C(=O)c2ccccc2)C1)N1CCOCC1,2 +EOS25011,Cc1ccc(C(=O)N2CCN(c3c(C)n(-c4ccccc4)c(=O)n(C)c3=O)CC2)cc1,2 +EOS91135,CNC[C@H]1CCCN(S(=O)(=O)c2ccc(OC)c(Cl)c2)C1.Cl,2 +EOS1554,CCn1nccc1C(=O)N1C[C@@H](NC(=O)c2ccccc2)[C@H](c2nc(C)n[nH]2)C1,2 +EOS42199,Cc1cnn(-c2ccc(C(=O)N3C[C@H](C)OCC3(C)C)cc2)c1,2 +EOS10884,NC(=O)c1ccc(S(=O)(=O)N[C@@H]2COCC[C@@H]2OCC2CC2)cc1,2 +EOS19234,COc1ccc(CNC(=O)CC2CCN(C(=O)C3CCCCC3)CC2)cc1,2 +EOS73714,CCCS(=O)(=O)N1CCC(C(=O)NC[C@H]2Cc3ccccc3O2)CC1,2 +EOS36281,CC(C)Nc1nc(C#N)nc(NC(C)C)n1,2 +EOS74281,COc1ccc(-c2noc(CN3CCO[C@@H](C)C3)n2)cc1,2 +EOS59428,Cc1cccn2cc(CC(=O)N(CC(F)(F)F)C(C)C)nc12,2 +EOS98466,CCOc1ccc(-c2noc(CCC(=O)N(CCOC)CCN(C)C)n2)cc1,2 +EOS59117,CCC1(CC)NC(=O)N(C[C@H](O)COc2cc(C)ccc2C)C1=O,2 +EOS40964,Cc1ccc([C@H](CNC(=O)NCc2cc[nH]n2)N2CCOCC2)o1,2 +EOS31674,CCn1ccc(C(=O)N2C[C@@H](Oc3ccccc3)C[C@H]2C(=O)NC2CC2)n1,2 +EOS101385,N#Cc1ccc2[nH]c(O)c(-c3ccc(CN4CCOCC4)cn3)c2c1,2 +EOS87294,CC1(C)CCN(C(=O)NC[C@H]2CCOC3(CCC3)C2)CCS1(=O)=O,2 +EOS60145,CC(C)c1n[nH]c([C@@H]2CN(C(=O)c3cc(Cl)c[nH]3)CCO2)n1,2 +EOS90458,Cl.O=S(=O)(NCC[C@H]1CCNC1)c1ccccc1Cl,2 +EOS80260,O=C(c1c(O)cc(F)cc1F)N(Cc1ccco1)C1CC1,2 +EOS28555,CC(=O)Nc1ccc(S(=O)(=O)N2CCC(c3noc(-c4ccccc4)n3)CC2)cc1,2 +EOS34965,CC(C)n1ccc2c(C(=O)NCCN3CCOCC3)cccc21,2 
+EOS82331,COc1ccccc1N1CCN(C(=O)CCNC(C)=O)CC1,2 +EOS96190,CO[C@@H](C)c1nnc(Cc2nc(-c3ccccc3)cs2)o1,2 +EOS85504,CCOCCCNC(=O)[C@@H](NC(=O)c1ccco1)C(C)C,2 +EOS5712,COc1cccc2ncn(Cc3ccccc3CN(C)C)c(=O)c12,2 +EOS86615,O=C(C1CC=CC1)N1CCC[C@H](c2cc(C(F)(F)F)[nH]n2)C1,2 +EOS90873,CCOc1ccc(-c2csc(CCN3CCOC3=O)n2)cc1,2 +EOS20619,NC(=O)c1ccccc1NC(=O)c1cccc(I)c1,2 +EOS63886,Cc1ccccc1CN(C)S(=O)(=O)c1cc(C(N)=O)sc1C,2 +EOS25224,Cc1ccc2c(c1)N(C(=O)Cn1nc3ccc(SC(C)(C)C)nn3c1=O)CCO2,2 +EOS17855,COCCn1nc2n(c1=O)CCN(C(=O)Cc1ccsc1)C2c1ccccc1,2 +EOS26802,O=C(Nc1ccccc1)N1CCN(c2nc3ccccc3n3cccc23)CC1,2 +EOS87770,CSc1ccc(N2C[C@@H](C(=O)N3CCN(C(=O)CC(C)C)CC3)CC2=O)cc1,2 +EOS93165,COc1cccc(F)c1NC(=O)CCc1cscn1,2 +EOS92810,Cn1ncc2c(=O)[nH]c(N3CCN(c4ccc(Cl)cc4)CC3)nc21,2 +EOS29000,Cc1nc2c(-c3ccncc3)nsc2c(=O)n1CC(=O)Nc1ccc2c(c1)OCO2,2 +EOS24947,COCCn1cc(C(=O)Nc2ccc(F)cc2F)c2nn(-c3ccccc3)c(=O)c-2c1,2 +EOS51767,O=C(CN1CCN(C(=O)c2nn(-c3ccccc3)c(=O)c3ccccc23)CC1)N1CCCCC1,2 +EOS90248,Cc1ccc(C(C)(C)C)cc1S(=O)(=O)CCCCS(C)(=O)=O,2 +EOS48186,CN(C)CC(=O)N[C@@H](Cc1ccccc1)c1nc2ccccc2o1,2 +EOS84204,Cc1cc(C(=O)N2CCSC[C@H]2C2CCC2)c2c(C)noc2n1,2 +EOS91903,Cc1cc(NC(=O)[C@H](C)N2CCSCC2)no1,2 +EOS93753,Cc1ccc(NC(=O)c2nn[nH]c2[C@@H]2CCCN2)cc1,2 +EOS50993,CNC(=O)NC(=O)CN1CCC[C@H]1c1nc2ccccc2s1,2 +EOS24437,COc1ccccc1NC(=O)CCn1cnc2oc(C)c(C)c2c1=O,2 +EOS54693,Cn1c(=O)n(CC(=O)Nc2nccn2Cc2ccccc2)c2ccccc21,2 +EOS7731,COCCNC(=O)c1cccc(NC(=O)C2CC23CCNCC3)c1C.Cl,2 +EOS40466,Cc1ccc(NC(=O)c2nn(-c3ccccc3)c(C)cc2=O)c(O)c1,2 +EOS40635,Cc1ccc([C@@H]2CN(C(=O)c3cc4ccccc4c(=O)[nH]3)CCO2)o1,2 +EOS67156,C[C@H]1CN(C(=O)c2c[nH]nc2-c2cccnc2)CC2(CCC2)O1,2 +EOS44393,CCOc1ccc(NC(=O)c2cccc(NC(N)=O)c2)cc1,2 +EOS89279,CC(C)Cc1ccc([C@@H](NC(=O)Cn2cccnc2=O)C2CC2)cc1,2 +EOS96607,CO[C@@H]1C[C@@H](c2ncn[nH]2)N(CCn2cnc3ccccc3c2=O)C1,2 +EOS44576,Cc1noc(C)c1CN1CCn2ccnc2C1,2 +EOS51077,Cl.N[C@@H]1CCN(C(=O)CSC(F)(F)F)C1,2 +EOS6745,CS(=O)(=O)N1CCCCC1CCNC(=O)c1n[nH]c2c1COCC2,2 +EOS43334,CCc1csc([C@H]2CCCN(C(=O)c3c(C)nn(C)c3OC)C2)n1,2 
+EOS34539,O=C(CCC(=O)N1CCN(c2ccccn2)CC1)NCCc1c[nH]c2ccccc12,2 +EOS12390,CC(=O)c1cccc(OCCN2CC(C)OC(C)C2)c1.Cl,2 +EOS64973,CNC(=O)C1CCN([C@@H](C)C(=O)NCCc2ccc(F)cc2)CC1,2 +EOS87818,CC(=O)c1cc2c(cc1NC(=O)c1ccc(-n3cccn3)cc1)OCO2,2 +EOS86089,CN(C[C@H]1COc2ccccc2O1)C(=O)c1ccc2c(c1)C(=O)N(Cc1ccco1)C2=O,2 +EOS25795,O=C(CCS(=O)(=O)c1ccc2c(c1)CCN2C(=O)C1CC1)Nc1ccccc1,2 +EOS32968,CCn1ncc(C(=O)N2CCCC(C)(c3noc(C4CCOCC4)n3)C2)c1C,2 +EOS16872,Cc1c(C)n(C(=O)CN2CCC(C)CC2)c2ccccc12,2 +EOS7109,Cc1oc2c(c1C(=O)N1CCc3[nH]cnc3C1c1cccnc1)C(=O)CCC2,2 +EOS100490,O=C(c1ccc(Cl)cc1Cl)n1nnc2ccccc21,2 +EOS41323,CS(=O)(=O)N[C@H]1CCCN(Cn2c(=O)oc3ccccc32)C1,2 +EOS55767,CCn1ccnc1CN(CCOC)S(=O)(=O)c1ccc(C#N)cc1,2 +EOS26822,COc1cccc(CNC(=O)CSc2nnc3c(=O)n(-c4ccccc4)ccn23)c1OC,2 +EOS53211,Cc1cc(F)ccc1NC(=O)Cn1c(=O)n(C)c2ccccc21,2 +EOS11168,COCCN1CC(C(=O)NCCn2nc(-c3ccccc3)ccc2=O)CCC1=O,2 +EOS37474,Cc1cc(=O)[nH]cc1C(=O)N1CC[C@H](Oc2ccccc2)C1,2 +EOS51623,O=C(Cc1csc(C2CCCC2)n1)N1CC2(CCNCC2)[C@H]2COC[C@H]21,2 +EOS25225,COc1ccc(-n2nc3oc4c(O)cccc4cc-3c2=O)cc1,2 +EOS80131,COc1ccc(NC(=O)c2ccc3nc(C)ccc3c2)cc1CO,2 +EOS69425,CCCOc1ccc(C(F)(F)F)cc1NC(=O)CN1CSCC1=O,2 +EOS60423,C#CCOc1ccc(CCNC(=O)Cn2cccn2)cc1,2 +EOS27923,CCn1c(N2CCCC(C(=O)NCC3CCN(Cc4cccc(C)c4)CC3)C2)cc(=O)n(C)c1=O,2 +EOS29821,Cc1noc(-c2ccnc(-n3cnc(C(=O)NC4CCCCC4)c3)c2)n1,2 +EOS84718,CN1CCC(N(C)C(=O)c2nn(-c3ccccc3)c(=O)c3ccccc23)CC1,2 +EOS78472,CCc1n[nH]c([C@@H]2CN(C(=O)CCn3ccnc3)CCO2)n1,2 +EOS78345,COc1cc(CN2CCC([C@H](O)c3nccn3C)CC2)cc(OC)c1,2 +EOS4115,CN(C(=O)CC1(c2cccc(C(F)(F)F)c2)CC(=O)N(C2CC2)C1=O)C1CCOC1,2 +EOS73098,CO[C@@H](C)CS(=O)(=O)Nc1ccc2nscc2c1,2 +EOS81365,COc1ccc(-c2nc(CN3CCN(c4nc(C)cs4)CC3)co2)cc1,2 +EOS87658,COc1c(C)cc(CNC[C@H]2CN(C)CCO2)cc1C,2 +EOS55331,O=C(c1ccc(-c2ccccc2)[nH]c1=O)N1CC[C@H]1c1ccccc1,2 +EOS14144,CCNc1cc(N2CCN(C(=O)c3ccc(C)cc3)CC2)nc(C)n1,2 +EOS809,CCc1nn2c(=O)cc(CSCC(=O)Nc3cccc(C)c3)nc2s1,2 +EOS17520,CCN(CC)C(=O)c1ccc2c(c1)N(CC(=O)NCc1ccc(C)cc1)C(=O)CS2,2 +EOS56103,O=c1[nH]c2ccccc2nc1CN1CCC[C@H]1c1cccs1,2 
+EOS67188,COc1ccc(C)cc1-n1ccc(C(=O)N2CCOC[C@@H]2C)n1,2 +EOS94960,Cc1ccc2[nH]c([C@H]3CCCN(S(=O)(=O)CCCF)C3)nc2c1,2 +EOS1652,COCCNC(=O)c1cc2n(n1)C[C@@H](NC(=O)C1(c3ccc(OC)cc3)CC1)C2,2 +EOS49107,Cc1[nH]c(=O)[nH]c1C(=O)N1C[C@H](COc2ccccc2)OC(C)(C)C1,2 +EOS49879,Cc1noc([C@H](C)N2CCN(c3cc(Cl)ccc3C#N)CC2)n1,2 +EOS49048,COc1ccccc1N1CCN(C(=O)Nc2cccc3cccnc23)CC1,2 +EOS16429,CNC(=O)N1CCC(C2CCN(C(=O)c3cccc4[nH]ccc34)C2)CC1,2 +EOS63304,CCCc1n[nH]c([C@@H]2CN(C(=O)c3cnccn3)CCO2)n1,2 +EOS72950,O=C(CCc1ccc(=O)[nH]c1)N[C@H]1[C@@H]2CN(Cc3ccccc3)C[C@@H]21,2 +EOS14232,CCS(=O)(=O)N1CCN(c2ccc(Nc3cccc(C)n3)nn2)CC1,2 +EOS74664,O=C(Nc1ccccc1C(=O)Nc1ccc(Cl)cn1)c1ccon1,2 +EOS91830,Cc1noc2nc(C(C)C)cc(C(=O)N3CCOC[C@@]3(C)C3CC3)c12,2 +EOS90365,CCOc1ccc(C(=O)N[C@@H](C)c2cccs2)cc1OCC,2 +EOS5048,COc1ccc([C@@H]2CN(C(=O)CCc3ccncc3)[C@@H]3C4CCN(CC4)[C@H]23)cc1,2 +EOS70509,CN1CCN(C(=O)Nc2ccn(CC(F)F)n2)Cc2ccccc21,2 +EOS57960,c1ccc(-c2nnc3ccc(N[C@H]4CCCC45OCCO5)nn23)cc1,2 +EOS58938,Cc1nn(C)c(C)c1CC(=O)N1CCC[C@H](n2ccnc2)C1,2 +EOS36643,CS(=O)(=O)N1CC(=O)N2CCc3ccccc3C2C1,2 +EOS32108,CCOCC(=O)N1CCC2(CC1)CN(c1ccc(C#N)c(C(F)(F)F)c1)CC2C(=O)NC,2 +EOS29164,CC(=O)Nc1ccc(-n2cnc(C(=O)N3CCN(c4cccc(C)c4C)CC3)c2)nc1,2 +EOS46827,Cc1ccc(CN(C)CC(=O)N(C)c2c(N)n(Cc3ccccc3)c(=O)[nH]c2=O)cc1,2 +EOS41610,CC(C)NC(=O)CN1CCCC[C@@H]1Cc1ccc(O)cc1,2 +EOS36335,Cn1c(=O)c(C(=O)NCc2cccnc2)c(O)c2ccccc21,2 +EOS921,Cc1nc(-c2ccc(N3CCN(C(=O)c4ccccc4)CC3)cc2)no1,2 +EOS102177,C[C@]12CCC3[C@@H](CC[C@]4(O)C[C@@H](O)CC[C@]34C=O)[C@@]1(O)CC[C@@H]2C1=CC(=O)OC1,2 +EOS26047,Cc1cccc(NC(=O)C2CCCN(S(=O)(=O)c3cn(C)cn3)C2)c1C,2 +EOS52350,Cc1ccc(S(=O)(=O)NCC[C@H]2CCNC2)c(C)c1.Cl,2 +EOS100819,CC1(COc2ccc3c(c2)ncn3-c2ccc3cccc(N4CCC(N)CC4)c3n2)COC1,2 +EOS37615,CN1C[C@@H](CCNC(=O)Nc2ccn3ccnc3c2)CC1=O,2 +EOS49443,NC(=O)c1cccc(Cc2noc([C@H]3CC34CCOCC4)n2)c1,2 +EOS77529,O=C(NCc1ccc(O)c(F)c1)Nc1ccc(-c2csnn2)cc1,2 +EOS87285,Cc1noc([C@@H]2COCCN2Cc2ccc(F)cc2F)n1,2 +EOS100247,CN1CCN(c2ccc(-c3ccncc3-c3cc(F)c(O)c(F)c3)cc2)CC1,2 
+EOS9367,O=S(=O)(c1cccs1)N1CCc2[nH]cnc2C1c1cccnc1,2 +EOS59854,O=S1(=O)CCC(S(=O)(=O)N2CCC[C@H]2c2cccc3ccccc23)CC1,2 +EOS68838,Nc1nnc(SCC(=O)N2CCNC2=O)s1,2 +EOS25661,Cc1ccc(CNC(=O)CC(C)n2nc(C)cc2C)cc1,2 +EOS68480,Cc1cnn(C[C@H]2CCCCN2Cc2nccn2CC(F)(F)F)c1,2 +EOS86969,COc1ccccc1NC(=O)c1ccc(S(C)(=O)=O)cc1,2 +EOS53930,CC(=O)N1CCN([C@@H](C)C(=O)Nc2ccc(Cl)cc2)CC1,2 +EOS92311,COCC[C@@H](CO)NC(=O)c1csc(-c2cnc[nH]2)n1,2 +EOS75702,CC[C@H]1C(=O)NCCN1C(=O)c1cn(-c2ccc(Br)cc2)cn1,2 +EOS20463,CCn1nccc1C(=O)N1CCCC(N(Cc2cccnc2)C(=O)COC)C1,2 +EOS55812,Cc1nonc1CC(=O)N1CCCSC1,2 +EOS88541,Cc1cc(C)n(C[C@H]2CCCN2C(=O)c2c(C)noc2C)n1,2 +EOS96439,CCN1CC(=O)N([C@H]2CCCOc3cc(Cl)ccc32)C1=O,2 +EOS47481,Cc1ccc(N2C[C@@H](C(=O)N3CCC(C(=O)NCc4ccccc4)CC3)CC2=O)cc1F,2 +EOS2110,COCCn1c(SCCCc2ccccc2)nc2c1c(=O)[nH]c(=O)n2C,2 +EOS41657,CN1C(=O)N[C@@H](c2cccs2)C2=C1CN(CCO)C2=O,2 +EOS97500,C[C@H](NCCN1CCc2ccccc2C1=O)c1ccc(=O)[nH]n1,2 +EOS40840,CNC(=O)c1ccc(OCc2cc(F)cc3cccnc23)cc1,2 +EOS93595,Cc1cc(NC(=O)NC[C@H](C)N2CCOCC2)no1,2 +EOS26486,O=C(c1ccccc1)N1CCN(C(=O)c2cc(-c3ccncc3)[nH]n2)CC1,2 +EOS37008,O=C1NC2(CCCC2)C(=O)N1Cc1coc(-c2cccc(F)c2)n1,2 +EOS54478,COc1ccc(C(=O)Nc2ccc(NC(C)=O)cc2)cc1Br,2 +EOS23573,Cc1cc(NC(=O)c2cc(S(=O)(=O)N3CCOCC3)cn2C)ccc1Br,2 +EOS41714,CCN(C)C(=O)c1cccc(C(=O)Nc2cccc(-n3cccn3)c2)c1,2 +EOS42037,CC[C@H]1CN(C(=O)c2cn3c(n2)CCC3)CCN1CC(F)F,2 +EOS54525,COc1ccc(NC(C)=O)cc1Nc1ncnc2ccsc12,2 +EOS89481,Cc1cccc(S(=O)(=O)NC(=O)Cc2ccc(OCC#N)cc2)c1,2 +EOS60072,CCN(CC)C(=O)c1nn(-c2ccccc2)c(=O)c2ccccc12,2 +EOS21545,Cc1ccc(S(=O)(=O)Nc2ccc3c(c2)oc(=O)n3C)cc1,2 +EOS92249,Cc1ccnc(NC(=O)CN(C)CC(=O)N(C)Cc2ccco2)c1,2 +EOS91561,Cc1nc(CN2CCN(C(=O)Nc3cnn([C@@H](C)C4CC4)c3)CC2)oc1C,2 \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/model_retrain/test_model_retrain.py b/atomsci/ddm/test/integrative/model_retrain/test_model_retrain.py new file mode 100644 index 00000000..3063acc8 --- /dev/null +++ b/atomsci/ddm/test/integrative/model_retrain/test_model_retrain.py @@ -0,0 +1,86 @@ +import 
atomsci.ddm.pipeline.parameter_parser as parse +import atomsci.ddm.pipeline.model_pipeline as mp +import atomsci.ddm.pipeline.compare_models as cm +import atomsci.ddm.utils.model_file_reader as mfr +import atomsci.ddm.utils.test_utils as tu +import atomsci.ddm.utils.model_retrain as mr +import os +import shutil +import glob +import json + +def clean(result_dir): + if os.path.exists(result_dir): + shutil.rmtree(result_dir) + +def train_model(result_dir): + """Train a model in production mode""" + + json_file = tu.relative_to_file(__file__, './config.json') + example_file = tu.relative_to_file(__file__, './example.csv') + + with open(json_file, 'r') as f: + config_json = json.load(f) + config_json['dataset_key'] = example_file + config_json['result_dir'] = result_dir + + # Parse parameters + params = parse.wrapper(config_json) + + # Create model pipeline + model = mp.ModelPipeline(params) + + # Train model + model.train_model() + +def retrain_model(model_tar, new_result_dir, keep_seed): + """Retrains a model""" + mr.train_model_from_tar(model_tar, new_result_dir, keep_seed=keep_seed) + +def run_test_retrain(keep_seed): + """Trains and retrains a model + + Trains and retrains a model and compares the results + """ + + # train a model + result_dir = tu.relative_to_file(__file__, 'result') + train_model(result_dir) + + # find the tar file + result_df = cm.get_filesystem_perf_results(result_dir) + assert(len(result_df) == 1) + model_tar = result_df['model_path'].values[0] + + # retrain the model + new_result_dir = tu.relative_to_file(__file__, 'retrain_result') + retrain_model(model_tar, new_result_dir, keep_seed) + + # find the new tar file + result_df = cm.get_filesystem_perf_results(new_result_dir) + assert(len(result_df) == 1) + new_model_tar = result_df['model_path'].values[0] + + original_model = mfr.ModelFileReader(model_tar) + new_model = mfr.ModelFileReader(new_model_tar) + + assert new_model.get_split_uuid() == original_model.get_split_uuid() + + if keep_seed: 
+ assert new_model.get_random_seed()==original_model.get_random_seed() + else: + assert new_model.get_random_seed()!=original_model.get_random_seed() + + # clean files + split_files = glob.glob(tu.relative_to_file(__file__, './example_*_random_*.csv')) + for sf in split_files: + os.remove(sf) + clean(new_result_dir) + clean(result_dir) + +def test_retrain(): + run_test_retrain(True) + run_test_retrain(False) + +if __name__ == '__main__': + test_retrain() \ No newline at end of file diff --git a/atomsci/ddm/utils/model_file_reader.py b/atomsci/ddm/utils/model_file_reader.py index 308c21de..1cfb737e 100644 --- a/atomsci/ddm/utils/model_file_reader.py +++ b/atomsci/ddm/utils/model_file_reader.py @@ -194,7 +194,14 @@ def get_response_cols(self): """ return self.get_training_dataset().get('response_cols') - + + def get_random_seed(self): + """Returns: + (int): random seed used in model training. Returns None if not found. + + """ + return self.metadata_dict.get('seed') + def get_model_info(self): """Extract the model metadata (and if applicable, model metrics) diff --git a/atomsci/ddm/utils/model_retrain.py b/atomsci/ddm/utils/model_retrain.py index 5f89e649..bb77aa3f 100644 --- a/atomsci/ddm/utils/model_retrain.py +++ b/atomsci/ddm/utils/model_retrain.py @@ -118,6 +118,8 @@ def train_model_from_tar(input, output, dskey='', production=False, keep_seed=Fa dskey (str): new dataset key if file location has changed + keep_seed (bool): True to keep the same random seed. 
+ Returns: the model pipeline object with trained model """ From c5e634fdf987ae2d4569c9a021d3e40368c3db4d Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 14:33:30 -0800 Subject: [PATCH 48/57] Move common functions to integrative_utilities --- .../ddm/test/integrative/integrative_utilities.py | 11 +++++++++++ .../test/integrative/sampling_test/test_sampling.py | 9 ++++----- .../test/integrative/seed_test/test_seed_models.py | 13 ++++--------- .../integrative/seed_test/test_seed_splitting.py | 13 ++++--------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/atomsci/ddm/test/integrative/integrative_utilities.py b/atomsci/ddm/test/integrative/integrative_utilities.py index 8db0c9f1..c02fb9f9 100644 --- a/atomsci/ddm/test/integrative/integrative_utilities.py +++ b/atomsci/ddm/test/integrative/integrative_utilities.py @@ -85,3 +85,14 @@ def copy_delaney(dest='.'): '../test_datasets/delaney-processed.csv')) shutil.copy(delaney_source, dest) + +def extract_seed(metadata_path): + with open(metadata_path, 'r') as f: + metadata = json.load(f) + return metadata.get('seed') + +def modify_params_with_seed(pparams, seed): + pparams.seed = seed + return pparams + + diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py index 488fcb6d..af6c4660 100644 --- a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py @@ -11,6 +11,10 @@ from atomsci.ddm.pipeline import parameter_parser as parse import atomsci.ddm.pipeline.predict_from_model as pfm +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from integrative_utilities import extract_seed + #------------------------------------------------------------------- def get_test_set(dataset_key, split_csv, id_col): @@ -37,11 +41,6 @@ def find_best_test_metric(model_metrics): return metric return None -def 
extract_seed(metadata_path): - with open(metadata_path, 'r') as f: - metadata = json.load(f) - return metadata.get('seed') - def saved_model_identity(pparams): script_path = os.path.dirname(os.path.realpath(__file__)) retrain_pparams = copy.copy(pparams) diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py index 9e9a1f2f..9cee12a3 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py @@ -8,6 +8,10 @@ from atomsci.ddm.pipeline import model_pipeline as mp from atomsci.ddm.pipeline import parameter_parser as parse +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from integrative_utilities import extract_seed, modify_params_with_seed + #------------------------------------------------------------------- """ This script does the following: @@ -38,15 +42,6 @@ def find_best_test_metric(model_metrics): return metric return None -def extract_seed(metadata_path): - with open(metadata_path, 'r') as f: - metadata = json.load(f) - return metadata.get('seed') - -def modify_params_with_seed(pparams, seed): - pparams.seed = seed - return pparams - def saved_model_identity(pparams): retrain_pparams = copy.copy(pparams) diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py b/atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py index a158e765..5feea923 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_splitting.py @@ -9,6 +9,10 @@ from atomsci.ddm.pipeline import model_pipeline as mp from atomsci.ddm.pipeline import parameter_parser as parse +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from integrative_utilities import extract_seed, modify_params_with_seed + 
#---------------------------------------------------------------------------------------------------------- def split_dataset(pparams): model_pipe = mp.ModelPipeline(pparams) @@ -16,15 +20,6 @@ def split_dataset(pparams): pparams.split_uuid = split_uuid return pparams -def extract_seed(metadata_path): - with open(metadata_path, 'r') as f: - metadata = json.load(f) - return metadata.get('seed') - -def modify_params_with_seed(pparams, seed): - pparams.seed = seed - return pparams - def compare_splits(original_split_csv, retrained_split_csv): original_split = pd.read_csv(original_split_csv) retrained_split = pd.read_csv(retrained_split_csv) From 36c38ecd041405aafa25607b7ca2c47a0ed966df Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 14:35:47 -0800 Subject: [PATCH 49/57] Move common functions to integrative_utilities --- .../test/integrative/integrative_utilities.py | 22 ++++++++++++++++ .../sampling_test/test_sampling.py | 26 +------------------ .../integrative/seed_test/test_seed_models.py | 18 +------------ 3 files changed, 24 insertions(+), 42 deletions(-) diff --git a/atomsci/ddm/test/integrative/integrative_utilities.py b/atomsci/ddm/test/integrative/integrative_utilities.py index c02fb9f9..8c130bda 100644 --- a/atomsci/ddm/test/integrative/integrative_utilities.py +++ b/atomsci/ddm/test/integrative/integrative_utilities.py @@ -95,4 +95,26 @@ def modify_params_with_seed(pparams, seed): pparams.seed = seed return pparams +def get_test_set(dataset_key, split_csv, id_col): + """ + Read the dataset key and split_uuid to split dataset into split components + + Parameters: + - dataset_key: path to csv file of dataset + - split_uuid: path to split csv file + - id_col: name of ID column + + Returns: + - train, valid, test dataframe + """ + df = pd.read_csv(dataset_key) + split_df=pd.read_csv(split_csv) + test_df = df[df[id_col].isin(split_df[split_df['subset']=='test']['cmpd_id'])] + + return test_df +def find_best_test_metric(model_metrics): + for 
metric in model_metrics: + if metric['label'] == 'best' and metric['subset']=='test': + return metric + return None diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py index af6c4660..4ba1a2b4 100644 --- a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py @@ -13,34 +13,10 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from integrative_utilities import extract_seed +from integrative_utilities import extract_seed, get_test_set, find_best_test_metric #------------------------------------------------------------------- -def get_test_set(dataset_key, split_csv, id_col): - """ - Read the dataset key and split_uuid to split dataset into split components - - Parameters: - - dataset_key: path to csv file of dataset - - split_uuid: path to split csv file - - id_col: name of ID column - - Returns: - - train, valid, test dataframe - """ - df = pd.read_csv(dataset_key) - split_df=pd.read_csv(split_csv) - test_df = df[df[id_col].isin(split_df[split_df['subset']=='test']['cmpd_id'])] - - return test_df - -def find_best_test_metric(model_metrics): - for metric in model_metrics: - if metric['label'] == 'best' and metric['subset']=='test': - return metric - return None - def saved_model_identity(pparams): script_path = os.path.dirname(os.path.realpath(__file__)) retrain_pparams = copy.copy(pparams) diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py index 9cee12a3..56dcd40e 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py @@ -10,7 +10,7 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from integrative_utilities import extract_seed, modify_params_with_seed +from integrative_utilities import 
extract_seed, get_test_set, find_best_test_metric #------------------------------------------------------------------- """ @@ -26,22 +26,6 @@ - train_valid_test split, k-fold cv split """ #------------------------------------------------------------------- - -def get_test_set(dataset_key, split_csv, id_col): - """ - Read the dataset key and split_uuid to split the dataset into split components - """ - df = pd.read_csv(dataset_key) - split_df = pd.read_csv(split_csv) - test_df = df[df[id_col].isin(split_df[split_df['subset'] == 'test']['cmpd_id'])] - return test_df - -def find_best_test_metric(model_metrics): - for metric in model_metrics: - if metric['label'] == 'best' and metric['subset'] == 'test': - return metric - return None - def saved_model_identity(pparams): retrain_pparams = copy.copy(pparams) From d11ee2c441efeb5759afef00f2ee7865146d1222 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 14:36:33 -0800 Subject: [PATCH 50/57] deleted unused imports --- atomsci/ddm/test/integrative/seed_test/test_seed_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py index 56dcd40e..a53d7397 100644 --- a/atomsci/ddm/test/integrative/seed_test/test_seed_models.py +++ b/atomsci/ddm/test/integrative/seed_test/test_seed_models.py @@ -10,7 +10,7 @@ import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from integrative_utilities import extract_seed, get_test_set, find_best_test_metric +from integrative_utilities import extract_seed, find_best_test_metric #------------------------------------------------------------------- """ From a089d1f0c7f875a834b322958df641abb3a8e885 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 15:13:29 -0800 Subject: [PATCH 51/57] moved params to json files --- ...perf_data_KFoldClassificationPerfData.json | 17 +++ 
...fig_perf_data_KFoldRegressoinPerfData.json | 18 +++ ...erf_data_KFoldRegressoinPerfDataMulti.json | 17 +++ ...erf_data_SimpleClassificationPerfData.json | 17 +++ ...ig_perf_data_SimpleRegressionPerfData.json | 17 +++ atomsci/ddm/test/unit/test_perf_data.py | 113 +++++------------- 6 files changed, 113 insertions(+), 86 deletions(-) create mode 100644 atomsci/ddm/test/unit/config_perf_data_KFoldClassificationPerfData.json create mode 100644 atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfData.json create mode 100644 atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfDataMulti.json create mode 100644 atomsci/ddm/test/unit/config_perf_data_SimpleClassificationPerfData.json create mode 100644 atomsci/ddm/test/unit/config_perf_data_SimpleRegressionPerfData.json diff --git a/atomsci/ddm/test/unit/config_perf_data_KFoldClassificationPerfData.json b/atomsci/ddm/test/unit/config_perf_data_KFoldClassificationPerfData.json new file mode 100644 index 00000000..5b0a19b1 --- /dev/null +++ b/atomsci/ddm/test/unit/config_perf_data_KFoldClassificationPerfData.json @@ -0,0 +1,17 @@ +{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": "replaced", + "id_col": "compound_id", + "response_cols":"active", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "classification", + "result_dir": "replaced"} \ No newline at end of file diff --git a/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfData.json b/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfData.json new file mode 100644 index 00000000..b2ea5bd8 --- /dev/null +++ b/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfData.json @@ -0,0 +1,18 @@ +{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + 
"featurizer": "ecfp", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": "replaced", + "id_col": "compound_id", + "response_cols":"pIC50", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir": "replaced" +} \ No newline at end of file diff --git a/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfDataMulti.json b/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfDataMulti.json new file mode 100644 index 00000000..9b3512a6 --- /dev/null +++ b/atomsci/ddm/test/unit/config_perf_data_KFoldRegressoinPerfDataMulti.json @@ -0,0 +1,17 @@ +{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "split_strategy": "k_fold_cv", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": "replaced", + "id_col": "compound_id", + "response_cols":["pIC50", "pIC50_dupe"], + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir":"replaced"} diff --git a/atomsci/ddm/test/unit/config_perf_data_SimpleClassificationPerfData.json b/atomsci/ddm/test/unit/config_perf_data_SimpleClassificationPerfData.json new file mode 100644 index 00000000..1017c83f --- /dev/null +++ b/atomsci/ddm/test/unit/config_perf_data_SimpleClassificationPerfData.json @@ -0,0 +1,17 @@ +{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "split_strategy": "train_valid_test", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "dataset_key": "replaced", + "id_col": "compound_id", + "response_cols":"active", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "classification", + "result_dir": "replaced"} \ No newline at 
end of file diff --git a/atomsci/ddm/test/unit/config_perf_data_SimpleRegressionPerfData.json b/atomsci/ddm/test/unit/config_perf_data_SimpleRegressionPerfData.json new file mode 100644 index 00000000..f4361844 --- /dev/null +++ b/atomsci/ddm/test/unit/config_perf_data_SimpleRegressionPerfData.json @@ -0,0 +1,17 @@ +{"verbose": "True", + "datastore": "False", + "save_results": "False", + "model_type": "NN", + "featurizer": "ecfp", + "split_strategy": "train_valid_test", + "splitter": "random", + "split_test_frac": "0.15", + "split_valid_frac": "0.15", + "transformers": "True", + "id_col": "compound_id", + "dataset_key": "replaced", + "response_cols":"pIC50", + "smiles_col": "base_rdkit_smiles", + "max_epochs":"2", + "prediction_type": "regression", + "result_dir": "replaced"} \ No newline at end of file diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py index d6c0637e..f218cfa5 100644 --- a/atomsci/ddm/test/unit/test_perf_data.py +++ b/atomsci/ddm/test/unit/test_perf_data.py @@ -7,6 +7,7 @@ import numpy as np import shutil import pandas as pd +import json def copy_to_temp(dskey, res_dir): new_dskey = shutil.copy(dskey, res_dir) @@ -20,26 +21,24 @@ def setup_paths(): return res_dir, tmp_dskey +def read_params(json_file, res_dir, tmp_dskey): + with open(json_file, 'r') as file: + params = json.load(file) + params['result_dir'] = res_dir + params['dataset_key'] = tmp_dskey + return params + +def make_relative_to_file(relative_path): + script_path = os.path.dirname(os.path.realpath(__file__)) + result = os.path.join(script_path, relative_path) + + return result + def test_KFoldRegressionPerfData(): res_dir, tmp_dskey = setup_paths() - params = {"verbose": "True", - "datastore": "False", - "save_results": "False", - "model_type": "NN", - "featurizer": "ecfp", - "split_strategy": "k_fold_cv", - "splitter": "random", - "split_test_frac": "0.15", - "split_valid_frac": "0.15", - "transformers": "True", - "dataset_key": tmp_dskey, 
- "id_col": "compound_id", - "response_cols":"pIC50", - "smiles_col": "base_rdkit_smiles", - "max_epochs":"2", - "prediction_type": "regression", - "result_dir":res_dir} + params = read_params(make_relative_to_file('config_perf_data_KFoldRegressoinPerfData.json'), + res_dir, tmp_dskey) # setup a pipeline that will be used to create performance data pparams = parse.wrapper(params) @@ -89,24 +88,8 @@ def test_KFoldRegressionPerfDataMulti(): df['pIC50_dupe'] = df['pIC50'] df.to_csv(tmp_dskey, index=False) - - params = {"verbose": "True", - "datastore": "False", - "save_results": "False", - "model_type": "NN", - "featurizer": "ecfp", - "split_strategy": "k_fold_cv", - "splitter": "random", - "split_test_frac": "0.15", - "split_valid_frac": "0.15", - "transformers": "True", - "dataset_key": tmp_dskey, - "id_col": "compound_id", - "response_cols":["pIC50", "pIC50_dupe"], - "smiles_col": "base_rdkit_smiles", - "max_epochs":"2", - "prediction_type": "regression", - "result_dir":res_dir} + params = read_params(make_relative_to_file('config_perf_data_KFoldRegressoinPerfDataMulti.json'), + res_dir, tmp_dskey) # setup a pipeline that will be used to create performance data pparams = parse.wrapper(params) @@ -151,23 +134,9 @@ def test_KFoldRegressionPerfDataMulti(): def test_KFoldClassificationPerfData(): res_dir, tmp_dskey = setup_paths() - params = {"verbose": "True", - "datastore": "False", - "save_results": "False", - "model_type": "NN", - "featurizer": "ecfp", - "split_strategy": "k_fold_cv", - "splitter": "random", - "split_test_frac": "0.15", - "split_valid_frac": "0.15", - "transformers": "True", - "dataset_key": tmp_dskey, - "id_col": "compound_id", - "response_cols":"active", - "smiles_col": "base_rdkit_smiles", - "max_epochs":"2", - "prediction_type": "classification", - "result_dir":res_dir} + params = read_params( + make_relative_to_file('config_perf_data_KFoldClassificationPerfData.json'), + res_dir, tmp_dskey) # setup a pipeline that will be used to create 
performance data pparams = parse.wrapper(params) @@ -216,23 +185,9 @@ def test_KFoldClassificationPerfData(): def test_SimpleRegressionPerfData(): res_dir, tmp_dskey = setup_paths() - params = {"verbose": "True", - "datastore": "False", - "save_results": "False", - "model_type": "NN", - "featurizer": "ecfp", - "split_strategy": "train_valid_test", - "splitter": "random", - "split_test_frac": "0.15", - "split_valid_frac": "0.15", - "transformers": "True", - "id_col": "compound_id", - "dataset_key": tmp_dskey, - "response_cols":"pIC50", - "smiles_col": "base_rdkit_smiles", - "max_epochs":"2", - "prediction_type": "regression", - "result_dir":res_dir} + params = read_params( + make_relative_to_file('config_perf_data_SimpleRegressionPerfData.json'), + res_dir, tmp_dskey) # setup a pipeline that will be used to create performance data pparams = parse.wrapper(params) @@ -274,23 +229,9 @@ def test_SimpleRegressionPerfData(): def test_SimpleClassificationPerfData(): res_dir, tmp_dskey = setup_paths() - params = {"verbose": "True", - "datastore": "False", - "save_results": "False", - "model_type": "NN", - "featurizer": "ecfp", - "split_strategy": "train_valid_test", - "splitter": "random", - "split_test_frac": "0.15", - "split_valid_frac": "0.15", - "transformers": "True", - "dataset_key": tmp_dskey, - "id_col": "compound_id", - "response_cols":"active", - "smiles_col": "base_rdkit_smiles", - "max_epochs":"2", - "prediction_type": "classification", - "result_dir":res_dir} + params = read_params( + make_relative_to_file('config_perf_data_SimpleClassificationPerfData.json'), + res_dir, tmp_dskey) # setup a pipeline that will be used to create performance data pparams = parse.wrapper(params) From 271c502052de421a8d85b00be9dbb6846ebbc0be Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 15:24:18 -0800 Subject: [PATCH 52/57] Prevent divide by zero case if the model never learns --- atomsci/ddm/pipeline/model_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py index 8ff57ad7..7c2392cc 100644 --- a/atomsci/ddm/pipeline/model_wrapper.py +++ b/atomsci/ddm/pipeline/model_wrapper.py @@ -954,7 +954,7 @@ def train_kfold_cv(self, pipeline): self._copy_model(self.best_model_dir) retrain_time = time.time() - retrain_start self.log.info("Time to retrain model for %d epochs: %.1f seconds, %.1f sec/epoch" % (self.best_epoch, retrain_time, - retrain_time/self.best_epoch)) + retrain_time/max(1, self.best_epoch))) # **************************************************************************************** def train_with_early_stopping(self, pipeline): From 48635cbe5300cb245837ea9d796a0ae776284605 Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Tue, 3 Dec 2024 16:00:30 -0800 Subject: [PATCH 53/57] Moved pandas import over to integrative_utilities --- atomsci/ddm/test/integrative/integrative_utilities.py | 1 + atomsci/ddm/test/integrative/sampling_test/test_sampling.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/integrative_utilities.py b/atomsci/ddm/test/integrative/integrative_utilities.py index 8c130bda..58afff4f 100644 --- a/atomsci/ddm/test/integrative/integrative_utilities.py +++ b/atomsci/ddm/test/integrative/integrative_utilities.py @@ -2,6 +2,7 @@ import json import os import shutil +import pandas as pd def clean_fit_predict(): diff --git a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py index 4ba1a2b4..d859a4ac 100644 --- a/atomsci/ddm/test/integrative/sampling_test/test_sampling.py +++ b/atomsci/ddm/test/integrative/sampling_test/test_sampling.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """Testing the sampling methods. Want to ensure that the model pipeline works and that the sampling methods are incorporated. Based off of the test_kfold_split.py method. 
""" -import pandas as pd import sklearn.metrics as skmetrics import copy import os From 0c67471a7219ac97151eb550a341c061793ebe5f Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 4 Dec 2024 08:42:32 -0800 Subject: [PATCH 54/57] Added a seed here for reproducability --- .../dc_models/reg_config_H1_fit_GraphConvModel.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GraphConvModel.json b/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GraphConvModel.json index 37af3d78..29277b7c 100644 --- a/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GraphConvModel.json +++ b/atomsci/ddm/test/integrative/dc_models/reg_config_H1_fit_GraphConvModel.json @@ -50,5 +50,6 @@ "comment": "Test", "comment": "just needs to run, doesn't need to pass", "comment": "----------------------------------------", - "perf_threshold": "0.3" + "perf_threshold": "0.3", + "seed": "0" } From 524d804490d202ee9397ac9a47e84b65fdfe4f4c Mon Sep 17 00:00:00 2001 From: "he6@llnl.gov" Date: Wed, 4 Dec 2024 16:01:19 -0800 Subject: [PATCH 55/57] Testing SMOTE and balancing transformer --- .../jsons/SMOTE_balancing_transformer.json | 31 +++++ .../jsons/balancing_transformer.json | 29 +++++ .../jsons/wo_balancing_transformer.json | 27 +++++ .../test_balancing_transformer.py | 107 ++++++++---------- 4 files changed, 137 insertions(+), 57 deletions(-) create mode 100644 atomsci/ddm/test/integrative/balancing_trans/jsons/SMOTE_balancing_transformer.json create mode 100644 atomsci/ddm/test/integrative/balancing_trans/jsons/balancing_transformer.json create mode 100644 atomsci/ddm/test/integrative/balancing_trans/jsons/wo_balancing_transformer.json diff --git a/atomsci/ddm/test/integrative/balancing_trans/jsons/SMOTE_balancing_transformer.json b/atomsci/ddm/test/integrative/balancing_trans/jsons/SMOTE_balancing_transformer.json new file mode 100644 index 00000000..49d7f4ab --- /dev/null +++ 
b/atomsci/ddm/test/integrative/balancing_trans/jsons/SMOTE_balancing_transformer.json @@ -0,0 +1,31 @@ +{ + "dataset_key" : "replaced", + "datastore" : "False", + "uncertainty": "False", + "splitter": "scaffold", + "split_valid_frac": "0.20", + "split_test_frac": "0.20", + "split_strategy": "train_valid_test", + "prediction_type": "classification", + "model_choice_score_type": "roc_auc", + "response_cols" : "active", + "id_col": "compound_id", + "smiles_col" : "rdkit_smiles", + "result_dir": "replaced", + "system": "LC", + "transformers": "True", + "model_type": "NN", + "featurizer": "computed_descriptors", + "descriptor_type": "rdkit_raw", + "weight_transform_type": "balancing", + "learning_rate": ".0007", + "layer_sizes": "512,128", + "dropouts": "0.3,0.3", + "save_results": "False", + "max_epochs": "2", + "early_stopping_patience": "2", + "verbose": "False", + "sampling_method": "SMOTE", + "sampling_ratio": "0.5", + "seed":"0" + } \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/balancing_trans/jsons/balancing_transformer.json b/atomsci/ddm/test/integrative/balancing_trans/jsons/balancing_transformer.json new file mode 100644 index 00000000..3a9b127f --- /dev/null +++ b/atomsci/ddm/test/integrative/balancing_trans/jsons/balancing_transformer.json @@ -0,0 +1,29 @@ +{ + "dataset_key" : "replaced", + "datastore" : "False", + "uncertainty": "False", + "splitter": "scaffold", + "split_valid_frac": "0.20", + "split_test_frac": "0.20", + "split_strategy": "train_valid_test", + "prediction_type": "classification", + "model_choice_score_type": "roc_auc", + "response_cols" : "active", + "id_col": "compound_id", + "smiles_col" : "rdkit_smiles", + "result_dir": "replaced", + "system": "LC", + "transformers": "True", + "model_type": "NN", + "featurizer": "computed_descriptors", + "descriptor_type": "rdkit_raw", + "weight_transform_type": "balancing", + "learning_rate": ".0007", + "layer_sizes": "512,128", + "dropouts": "0.3,0.3", + "save_results": 
"False", + "max_epochs": "2", + "early_stopping_patience": "2", + "verbose": "False", + "seed":"0" + } \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/balancing_trans/jsons/wo_balancing_transformer.json b/atomsci/ddm/test/integrative/balancing_trans/jsons/wo_balancing_transformer.json new file mode 100644 index 00000000..00c5160c --- /dev/null +++ b/atomsci/ddm/test/integrative/balancing_trans/jsons/wo_balancing_transformer.json @@ -0,0 +1,27 @@ +{ + "dataset_key" : "replaced", + "datastore" : "False", + "uncertainty": "False", + "splitter": "scaffold", + "split_valid_frac": "0.20", + "split_test_frac": "0.20", + "split_strategy": "train_valid_test", + "prediction_type": "classification", + "model_choice_score_type": "roc_auc", + "response_cols" : "active", + "id_col": "compound_id", + "smiles_col" : "rdkit_smiles", + "result_dir": "replaced", + "system": "LC", + "transformers": "True", + "model_type": "NN", + "featurizer": "ecfp", + "learning_rate": ".0007", + "layer_sizes": "512,128", + "dropouts": "0.3,0.3", + "save_results": "False", + "max_epochs": "2", + "early_stopping_patience": "2", + "verbose": "False", + "seed":"0" + } \ No newline at end of file diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 09043c45..9711cd7e 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -3,12 +3,14 @@ import atomsci.ddm.pipeline.parameter_parser as parse import atomsci.ddm.pipeline.model_pipeline as mp import numpy as np +import os +import json import logging logger = logging.getLogger(__name__) def test_balancing_transformer(): - dset_key = '../../test_datasets/MRP3_dataset.csv' + dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv') res_dir = tempfile.mkdtemp() @@ -24,6 +26,21 @@ def 
test_balancing_transformer(): assert weight == 1 assert count == 436 + smote_balanced_params = params_w_SMOTE_balan(dset_key, res_dir) + smote_balanced_params['sampling_ratio'] = 0.5 + smote_balanced_weights = make_pipeline_and_get_weights(smote_balanced_params) + (weight,), (count,) = np.unique(smote_balanced_weights, return_counts=True) + # all weights should be the same + assert np.all(weight==weight[0]) + + smote_balanced_params = params_w_SMOTE_balan(dset_key, res_dir) + smote_balanced_params['sampling_ratio'] = 0.8 + smote_balanced_weights = make_pipeline_and_get_weights(smote_balanced_params) + (major_weight, minor_weight), (major_count, minor_count) = np.unique(smote_balanced_weights, return_counts=True) + # There should be one weight that's larger and one that is smaller + assert major_weight < minor_weight + assert major_count > minor_count + def make_pipeline_and_get_weights(params): pparams = parse.wrapper(params) model_pipeline = mp.ModelPipeline(pparams) @@ -31,69 +48,45 @@ def make_pipeline_and_get_weights(params): return model_pipeline.data.train_valid_dsets[0][0].w +def make_relative_to_file(relative_path): + script_path = os.path.dirname(os.path.realpath(__file__)) + result = os.path.join(script_path, relative_path) + + return result + +def read_params(json_file, tmp_dskey, res_dir): + with open(json_file, 'r') as file: + params = json.load(file) + params['result_dir'] = res_dir + params['dataset_key'] = tmp_dskey + return params + def params_wo_balan(dset_key, res_dir): # Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics. 
- params = { - "dataset_key" : dset_key, - "datastore" : "False", - "uncertainty": "False", - "splitter": "scaffold", - "split_valid_frac": "0.20", - "split_test_frac": "0.20", - "split_strategy": "train_valid_test", - "prediction_type": "classification", - "model_choice_score_type": "roc_auc", - "response_cols" : "active", - "id_col": "compound_id", - "smiles_col" : "rdkit_smiles", - "result_dir": res_dir, - "system": "LC", - "transformers": "True", - "model_type": "NN", - "featurizer": "ecfp", - "learning_rate": ".0007", - "layer_sizes": "512,128", - "dropouts": "0.3,0.3", - "save_results": "False", - "max_epochs": "2", # You don't need to train very long. Just need to build datasets - "early_stopping_patience": "2", - "verbose": "False", - "seed":"0", - } + params = read_params( + make_relative_to_file('jsons/wo_balancing_transformer.json'), + dset_key, + res_dir) return params def params_w_balan(dset_key, res_dir): # Now train models on the same dataset with balancing weights - params = { - "dataset_key" : dset_key, - "datastore" : "False", - "uncertainty": "False", - "splitter": "scaffold", - "split_valid_frac": "0.20", - "split_test_frac": "0.20", - "split_strategy": "train_valid_test", - "prediction_type": "classification", - "model_choice_score_type": "roc_auc", - "response_cols" : "active", - "id_col": "compound_id", - "smiles_col" : "rdkit_smiles", - "result_dir": res_dir, - "system": "LC", - "transformers": "True", - "model_type": "NN", - "featurizer": "computed_descriptors", - "descriptor_type": "rdkit_raw", - "weight_transform_type": "balancing", - "learning_rate": ".0007", - "layer_sizes": "512,128", - "dropouts": "0.3,0.3", - "save_results": "False", - "max_epochs": "2", - "early_stopping_patience": "2", - "verbose": "False", - "seed":"0", - } + params = read_params( + make_relative_to_file('jsons/balancing_transformer.json'), + dset_key, + res_dir + ) + + return params + +def params_w_SMOTE_balan(dset_key, res_dir): + # Try with SMOTE with ratio set 
to .50 + params = read_params( + make_relative_to_file('jsons/SMOTE_balancing_transformer.json'), + dset_key, + res_dir + ) return params From a057ba49f88d017ac099adb72c51603bb6a16fad Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Mon, 13 Jan 2025 15:03:30 -0500 Subject: [PATCH 56/57] global seed warning --- atomsci/ddm/pipeline/random_seed.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/atomsci/ddm/pipeline/random_seed.py b/atomsci/ddm/pipeline/random_seed.py index 9fb541ae..b9838b37 100644 --- a/atomsci/ddm/pipeline/random_seed.py +++ b/atomsci/ddm/pipeline/random_seed.py @@ -4,6 +4,8 @@ import random import torch import tensorflow as tf +import logging +logging.basicConfig(format='%(asctime)-15s %(message)s') #---------------------------------------------------------------------------------- class RandomStateGenerator: """ @@ -25,6 +27,9 @@ def __init__(self, params=None, seed=None): self.set_seed(self.seed) def set_seed(self, seed): + log = logging.getLogger('ATOM') + log.warning("The global seed is being set to %d, for reproducibility. Note that this action " + "will synchronize the randonmess across all libraries which may impact the randomness of other parts of the pipeline.", seed) """Set the seed for all relevant libraries.""" global _seed, _random_state From 10c4ba7a556867ab6a4e0ccdb617e05974de0454 Mon Sep 17 00:00:00 2001 From: Rose Wilfong Date: Mon, 13 Jan 2025 16:47:23 -0500 Subject: [PATCH 57/57] global seed warning --- atomsci/ddm/pipeline/random_seed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/random_seed.py b/atomsci/ddm/pipeline/random_seed.py index b9838b37..847a62ae 100644 --- a/atomsci/ddm/pipeline/random_seed.py +++ b/atomsci/ddm/pipeline/random_seed.py @@ -28,8 +28,7 @@ def __init__(self, params=None, seed=None): def set_seed(self, seed): log = logging.getLogger('ATOM') - log.warning("The global seed is being set to %d, for reproducibility. 
Note that this action " - "will synchronize the randonmess across all libraries which may impact the randomness of other parts of the pipeline.", seed) + log.warning("The global seed is being set to %d, for reproducibility. Note that this action will synchronize the randonmess across all libraries which may impact the randomness of other parts of the pipeline.", seed) """Set the seed for all relevant libraries.""" global _seed, _random_state