From 5fde57a7638d8b0122d80370f624bd09bfbd6d35 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 19 Aug 2024 19:07:54 +0000 Subject: [PATCH 01/54] added autogluon support --- pyproject.toml | 4 ++-- src/MEDS_tabular_automl/mapper.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 475cf78..5070616 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ authors = [ ] description = "Scalable Tabularization of MEDS format Time-Series data" readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", @@ -17,7 +17,6 @@ classifiers = [ dependencies = [ "polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3", - "MEDS-transforms==0.0.5", ] [project.scripts] @@ -33,6 +32,7 @@ generate-subsets = "MEDS_tabular_automl.scripts.generate_subsets:main" dev = ["pre-commit"] tests = ["pytest", "pytest-cov", "rootutils"] profiling = ["mprofile", "matplotlib"] +autogluon = ["autogluon; python_version=='3.11.*'"] # Environment marker to restrict AutoGluon to Python 3.11 [build-system] requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"] diff --git a/src/MEDS_tabular_automl/mapper.py b/src/MEDS_tabular_automl/mapper.py index 34275b8..9870c50 100644 --- a/src/MEDS_tabular_automl/mapper.py +++ b/src/MEDS_tabular_automl/mapper.py @@ -5,10 +5,12 @@ from collections.abc import Callable from datetime import datetime from pathlib import Path +from typing import TypeVar from loguru import logger LOCK_TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" +DF_T = TypeVar("DF_T") def get_earliest_lock(cache_directory: Path) -> datetime | None: @@ -82,9 +84,7 @@ def register_lock(cache_directory: Path) -> tuple[datetime, Path]: return lock_time, lock_fp -def wrap[ - DF_T -]( +def wrap( in_fp: Path, out_fp: Path, read_fn: Callable[[Path], DF_T], From d6832cb027beb901851bd709aa899d1ad4ec012c Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Mon, 19 Aug 2024 19:44:07 +0000 Subject: [PATCH 02/54] updates for autogluon --- src/MEDS_tabular_automl/base_model.py | 236 +++++++++ .../configs/launch_basemodel.yaml | 33 ++ .../scripts/launch_basemodel.py | 58 +++ .../scripts/launch_xgboost.py | 403 +-------------- src/MEDS_tabular_automl/tabular_dataset.py | 474 ++++++++++++++++++ src/MEDS_tabular_automl/xgboost_model.py | 194 +++++++ tests/test_tabularize.py | 39 ++ 7 files changed, 1038 insertions(+), 399 deletions(-) create mode 100644 src/MEDS_tabular_automl/base_model.py create mode 100644 src/MEDS_tabular_automl/configs/launch_basemodel.yaml create mode 100644 src/MEDS_tabular_automl/scripts/launch_basemodel.py create mode 100644 src/MEDS_tabular_automl/tabular_dataset.py create mode 100644 src/MEDS_tabular_automl/xgboost_model.py diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py new file mode 100644 index 0000000..b6f0276 --- /dev/null +++ b/src/MEDS_tabular_automl/base_model.py @@ -0,0 +1,236 @@ +from pathlib import Path + +import hydra +import numpy as np +import scipy.sparse as sp +from loguru import logger +from mixins import TimeableMixin +from omegaconf import DictConfig +from sklearn.metrics import roc_auc_score + +from .tabular_dataset import TabularDataset + + +class BaseIterator(TabularDataset, TimeableMixin): + """BaseIterator class for loading and processing 
data shards for use in SciKit-Learn models. + + This class provides functionality for iterating through data shards, loading + feature data and labels, and processing them based on the provided configuration. + + Args: + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. + + Attributes: + cfg: Configuration dictionary containing parameters for + data processing, feature selection, and other settings. + file_name_resolver: Object for resolving file names and paths based on the configuration. + split: The data split being used for loading and processing data shards. + _data_shards: List of data shard names. + valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. + labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. + codes_set: Set of codes to include in the data. + code_masks: Dictionary of code masks for filtering features based on aggregation. + num_features: Total number of features in the data. + """ + + def __init__(self, cfg: DictConfig, split: str): + """Initializes the BaseIterator with the provided configuration and data split. + + Args: + cfg: The configuration dictionary. + split: The data split to use. + """ + TabularDataset.__init__(self, cfg=cfg, split=split) + TimeableMixin.__init__(self) + self.valid_event_ids, self.labels = self._load_ids_and_labels() + # check if the labels are empty + if len(self.labels) == 0: + raise ValueError("No labels found.") + # self._it = 0 + + # def iterate(self, function): + # for shard_idx in range(len(self._data_shards)): + # data, labels = self.get_data_shards(shard_idx) + # function(data, labels) + + +class BaseMatrix(TimeableMixin): + """BaseMatrix class for loading and processing data shards for use in SciKit-Learn models.""" + + def __init__(self, data: sp.csr_matrix, labels: np.ndarray): + """Initializes the BaseMatrix with the provided configuration and data split. + + Args: + data + """ + super().__init__() + self.data = data + self.labels = labels + + def get_data(self): + return self.data + + def get_label(self): + return self.labels + + +class BaseModel(TimeableMixin): + """Class for configuring, training, and evaluating an SciKit-Learn model. + + This class utilizes the configuration settings provided to manage the training and evaluation + process of an XGBoost model, ensuring the model is trained and validated using specified parameters + and data splits. It supports training with in-memory data handling as well as direct streaming from + disk using iterators. + + Args: + cfg: The configuration settings for the model, including data paths, model parameters, + and flags for data handling. + + Attributes: + cfg: Configuration object containing all settings required for model operation. + model: The XGBoost model after being trained. + dtrain: The training dataset in DMatrix format. + dtuning: The tuning (validation) dataset in DMatrix format. + dheld_out: The held-out (test) dataset in DMatrix format. + itrain: Iterator for the training dataset. + ituning: Iterator for the tuning dataset. + iheld_out: Iterator for the held-out dataset. + keep_data_in_memory: Flag indicating whether to keep all data in memory or stream from disk. + """ + + def __init__(self, cfg: DictConfig): + """Initializes the XGBoostClassifier with the provided configuration. 
+ + Args: + cfg: The configuration dictionary. + """ + self.cfg = cfg + self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + + self.itrain = None + self.ituning = None + self.iheld_out = None + + self.dtrain = None + self.dtuning = None + self.dheld_out = None + + self.model = hydra.utils.call(cfg.model_params.model) + # check that self.model is a valid model + if not hasattr(self.model, "fit"): + raise ValueError("Model does not have a fit method.") + + @TimeableMixin.TimeAs + def _build_data(self): + """Builds necessary data structures for training.""" + if self.keep_data_in_memory: + self._build_iterators() + self._build_matrix_in_memory() + else: + self._build_iterators() + + def _fit_from_partial(self): + """Fits model until convergence or maximum epochs.""" + if not hasattr(self.model, "partial_fit"): + raise ValueError( + f"Data is loaded in shards, but {self.model.__class__.__name__} does not support partial_fit." + ) + classes = self.itrain.get_classes() + best_auc = 0 + best_epoch = 0 + for epoch in range(self.cfg.model_params.epochs): + # train on each all data + for shard_idx in range(len(self.itrain._data_shards)): + data, labels = self.itrain.get_data_shards(shard_idx) + # if self.model.shuffle: # TODO: check this for speed + # # shuffle data + # indices = np.random.permutation(len(labels)) + # data = data[indices] + # labels = labels[indices] + self.model.partial_fit(data, labels, classes=classes) + # evaluate on tuning set + auc = self.evaluate() + # early stopping + if auc > best_auc: + best_auc = auc + best_epoch = epoch + if epoch - best_epoch > self.cfg.model_params.early_stopping_rounds: + break + + @TimeableMixin.TimeAs + def _train(self): + """Trains the model.""" + # two cases: data is in memory or data is streamed + if self.keep_data_in_memory: + self.model.fit(self.dtrain.get_data(), self.dtrain.get_label()) + else: + self._fit_from_partial() + + @TimeableMixin.TimeAs + def train(self): + """Trains the model.""" + self._build_data() + self._train() + + @TimeableMixin.TimeAs + def _build_matrix_in_memory(self): + """Builds the DMatrix from the data in memory.""" + self.dtrain = BaseMatrix(*self.itrain.get_data()) + self.dtuning = BaseMatrix(*self.ituning.get_data()) + self.dheld_out = BaseMatrix(*self.iheld_out.get_data()) + + @TimeableMixin.TimeAs + def _build_iterators(self): + """Builds the iterators for training, validation, and testing.""" + self.itrain = BaseIterator(self.cfg, split="train") + self.ituning = BaseIterator(self.cfg, split="tuning") + self.iheld_out = BaseIterator(self.cfg, split="held_out") + + @TimeableMixin.TimeAs + def evaluate(self) -> float: + """Evaluates the model on the tuning set. + + Returns: + The evaluation metric as the ROC AUC score. 
+ """ + # check if model has predict_proba method + if not hasattr(self.model, "predict_proba"): + raise ValueError(f"Model {self.model.__class__.__name__} does not have a predict_proba method.") + # two cases: data is in memory or data is streamed + if self.keep_data_in_memory: + y_pred = self.model.predict_proba(self.dtuning.get_data())[:, 1] + y_true = self.dtuning.get_label() + else: + y_pred = [] + y_true = [] + for shard_idx in range(len(self.ituning._data_shards)): + data, labels = self.ituning.get_data_shards(shard_idx) + y_pred.extend(self.model.predict_proba(data)[:, 1]) + y_true.extend(labels) + y_pred = np.array(y_pred) + y_true = np.array(y_true) + # check if y_pred and y_true are not empty + if len(y_pred) == 0 or len(y_true) == 0: + raise ValueError("Predictions or true labels are empty.") + return roc_auc_score(y_true, y_pred) + + def save_model(self, output_fp: str): + """Saves the model to the specified file path. + + Args: + output_fp: The file path to save the model to. + """ + output_fp = Path(output_fp) + # check if model has save method + if not hasattr(self.model, "save_model"): + logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") + logger.info("Model will be saved using pickle dump.") + from pickle import dump + + with open(output_fp.parent / "model.pkl", "wb") as f: + dump(self.model, f, protocol=5) + else: + self.model.save_model(output_fp) diff --git a/src/MEDS_tabular_automl/configs/launch_basemodel.yaml b/src/MEDS_tabular_automl/configs/launch_basemodel.yaml new file mode 100644 index 0000000..0be805a --- /dev/null +++ b/src/MEDS_tabular_automl/configs/launch_basemodel.yaml @@ -0,0 +1,33 @@ +defaults: + - default + - tabularization: default + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + - override hydra/launcher: joblib + - _self_ + +task_name: task + +# Task cached data dir +input_dir: ${output_cohort_dir}/${task_name}/task_cache +# Directory with task labels +input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +# Where to output the model and cached data +model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +output_filepath: ${model_dir}/model_metadata.json + +# Model parameters +model_params: + epochs: 20 + early_stopping_rounds: 5 + model: + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss + # n_iter: ${model_params.epochs} # not sure if we want this behaviour + iterator: + keep_data_in_memory: True + binarize_task: True + +log_dir: ${model_dir}/.logs/ + +name: launch_basemodel diff --git a/src/MEDS_tabular_automl/scripts/launch_basemodel.py b/src/MEDS_tabular_automl/scripts/launch_basemodel.py new file mode 100644 index 0000000..fbbb0ee --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/launch_basemodel.py @@ -0,0 +1,58 @@ +from pathlib import Path + +import hydra +from loguru import logger +from omegaconf import DictConfig + +from ..base_model import BaseModel +from ..utils import hydra_loguru_init + +# config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_basemodel.yaml") +# if not config_yaml.is_file(): +# raise FileNotFoundError("Core configuration not successfully installed!") +config_yaml = Path("./src/MEDS_tabular_automl/configs/launch_basemodel.yaml") + + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) +def main(cfg: DictConfig) -> float: + """Optimizes the model based on the provided configuration. 
+ + Args: + cfg: The configuration dictionary specifying model and training parameters. + + Returns: + The evaluation result as the ROC AUC score on the held-out test set. + """ + + # print(OmegaConf.to_yaml(cfg)) + if not cfg.loguru_init: + hydra_loguru_init() + try: + model = BaseModel(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # print( + # "Time Profiling for window sizes ", + # f"{cfg.tabularization.window_sizes} and min ", + # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", + # ) + # print("Train Time: \n", model._profile_durations()) + # print("Train Iterator Time: \n", model.itrain._profile_durations()) + # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + except Exception as e: + logger.error(f"Error occurred: {e}") + auc = 0.0 + return auc + + +if __name__ == "__main__": + main() diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 25bd7de..fd09e70 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -1,413 +1,18 @@ -from collections.abc import Callable, Mapping from importlib.resources import files from pathlib import Path import hydra -import numpy as np -import polars as pl -import scipy.sparse as sp -import xgboost as xgb from loguru import logger -from mixins import TimeableMixin -from omegaconf import DictConfig, OmegaConf -from sklearn.metrics import roc_auc_score +from omegaconf import DictConfig -from ..describe_codes import get_feature_columns -from ..file_name import get_model_files, list_subdir_files -from ..utils import get_feature_indices, hydra_loguru_init +from ..utils import hydra_loguru_init +from ..xgboost_model import XGBoostModel config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") -class Iterator(xgb.DataIter, TimeableMixin): - """Iterator class for loading and processing data shards for use in XGBoost models. - - This class provides functionality for iterating through data shards, loading - feature data and labels, and processing them based on the provided configuration. - - Args: - cfg: A configuration dictionary containing parameters for - data processing, feature selection, and other settings. - split: The data split to use, which can be one of "train", "tuning", - or "held_out". This determines which subset of the data is loaded and processed. - - Attributes: - cfg: Configuration dictionary containing parameters for - data processing, feature selection, and other settings. - file_name_resolver: Object for resolving file names and paths based on the configuration. - split: The data split being used for loading and processing data shards. - _data_shards: List of data shard names. - valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. - labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. - codes_set: Set of codes to include in the data. - code_masks: Dictionary of code masks for filtering features based on aggregation. - num_features: Total number of features in the data. 
- """ - - def __init__(self, cfg: DictConfig, split: str = "train"): - """Initializes the Iterator with the provided configuration and data split. - - Args: - cfg: A configuration dictionary containing parameters for - data processing, feature selection, and other settings. - split: The data split to use, which can be one of "train", "tuning", - or "held_out". This determines which subset of the data is loaded and processed. - """ - # generate_permutations(cfg.tabularization.window_sizes) - # generate_permutations(cfg.tabularization.aggs) - self.cfg = cfg - self.split = split - # Load shards for this split - self._data_shards = sorted( - [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] - ) - self.valid_event_ids, self.labels = self.load_labels() - self.codes_set, self.code_masks, self.num_features = self._get_code_set() - self._it = 0 - - super().__init__(cache_prefix=Path(cfg.cache_dir)) - - @TimeableMixin.TimeAs - def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: - """Creates boolean masks for filtering features. - - Creates a dictionary of boolean masks for each aggregation type. The masks are used to filter - the feature columns based on the specified included codes and minimum code inclusion frequency. - - Args: - feature_columns: List of feature columns. - codes_set: Set of codes to include. - - Returns: - Dictionary of code masks for each aggregation. - """ - code_masks = {} - for agg in set(self.cfg.tabularization.aggs): - feature_ids = get_feature_indices(agg, feature_columns) - code_mask = [True if idx in codes_set else False for idx in feature_ids] - code_masks[agg] = code_mask - return code_masks - - @TimeableMixin.TimeAs - def _load_matrix(self, path: Path) -> sp.csc_matrix: - """Loads a sparse matrix from disk. - - Args: - path: Path to the sparse matrix. - - Returns: - The sparse matrix. - - Raises: - ValueError: If the loaded array does not have exactly 3 rows, indicating an unexpected format. - """ - npzfile = np.load(path) - array, shape = npzfile["array"], npzfile["shape"] - if array.shape[0] != 3: - raise ValueError(f"Expected array to have 3 rows, but got {array.shape[0]} rows") - data, row, col = array - return sp.csc_matrix((data, (row, col)), shape=shape) - - @TimeableMixin.TimeAs - def load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: - """Loads valid event ids and labels for each shard. - - Returns: - A tuple containing two mappings: one from shard indices to lists of valid event IDs - which is used for indexing rows in the sparse matrix, and another from shard indices - to lists of corresponding labels. 
- """ - label_fps = { - shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") - for shard in self._data_shards - for shard in self._data_shards - } - cached_labels, cached_event_ids = dict(), dict() - for shard, label_fp in label_fps.items(): - label_df = pl.scan_parquet(label_fp) - cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() - - # TODO: check this for Nan or any other case we need to worry about - cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - if self.cfg.model_params.iterator.binarize_task: - cached_labels[shard] = cached_labels[shard].map_elements( - lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 - ) - - return cached_event_ids, cached_labels - - @TimeableMixin.TimeAs - def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: - """Determines the set of feature codes to include based on the configuration settings. - - Returns: - A tuple containing: - - A set of feature indices to be included. - - A mapping from aggregation types to boolean masks indicating whether each feature is included. - - The total number of features. - """ - feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) - feature_dict = {col: i for i, col in enumerate(feature_columns)} - allowed_codes = set(self.cfg.tabularization._resolved_codes) - codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} - - return ( - codes_set, - self._get_code_masks(feature_columns, codes_set), - len(feature_columns), - ) - - @TimeableMixin.TimeAs - def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: - """Loads a specific data shard into memory as a sparse matrix. - - Args: - path: Path to the sparse shard. - idx: Index of the shard. - - Returns: - The sparse matrix loaded from the file. - """ - # column_shard is of form event_idx, feature_idx, value - matrix = self._load_matrix(path) - if path.stem in ["first", "present"]: - agg = f"static/{path.stem}" - else: - agg = f"{path.parent.stem}/{path.stem}" - - return self._filter_shard_on_codes_and_freqs(agg, matrix) - - @TimeableMixin.TimeAs - def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: - """Loads a shard and returns it as a sparse matrix after applying feature inclusion filtering. - - Args: - idx: Index of the shard to load from disk. - - Returns: - The filtered sparse matrix. - - Raises: - ValueError: If any of the required files for the shard do not exist. - """ - # get all window_size x aggreagation files using the file resolver - files = get_model_files(self.cfg, self.split, self._data_shards[idx]) - - if not all(file.exists() for file in files): - raise ValueError(f"Not all files exist for shard {self._data_shards[idx]}") - - dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files] - - combined_csc = sp.hstack(dynamic_cscs, format="csc") - - return combined_csc - - @TimeableMixin.TimeAs - def _get_shard_by_index(self, idx: int) -> tuple[sp.csc_matrix, np.ndarray]: - """Loads a specific shard of data from disk and concatenate with static data. - - Args: - idx: Index of the shard to load. - - Returns: - A tuple containing the combined feature data and the corresponding labels - for the given shard. 
- """ - dynamic_df = self._get_dynamic_shard_by_index(idx) - label_df = self.labels[self._data_shards[idx]] - return dynamic_df, label_df - - @TimeableMixin.TimeAs - def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: - """Filters the given data frame based on the inclusion sets and aggregation type. - - Given the codes_mask, the method filters the dynamic data frame to only include - columns that are True in the mask. - - Args: - agg: The aggregation type used to determine the filtering logic. - df: The data frame to be filtered. - - Returns: - The filtered data frame. - """ - if self.codes_set is None: - return df - - ckey = f"_filter_shard_on_codes_and_freqs/{agg}" - self._register_start(key=ckey) - - df = df[:, self.code_masks[agg]] - - self._register_end(key=ckey) - - return df - - @TimeableMixin.TimeAs - def next(self, input_data: Callable) -> int: - """Advances the iterator by one step and provides data to XGBoost for DMatrix construction. - - Args: - input_data: A function passed by XGBoost with the same signature as `DMatrix`. - - Returns: - 0 if end of iteration, 1 otherwise. - """ - if self._it == len(self._data_shards): - # return 0 to let XGBoost know this is the end of iteration - return 0 - - # input_data is a function passed in by XGBoost who has the exact same signature of - # ``DMatrix`` - X, y = self._get_shard_by_index(self._it) # self._data_shards[self._it]) - input_data(data=sp.csr_matrix(X), label=y) - self._it += 1 - # Return 1 to let XGBoost know we haven't seen all the files yet. - return 1 - - @TimeableMixin.TimeAs - def reset(self): - """Resets the iterator to its beginning.""" - self._it = 0 - - @TimeableMixin.TimeAs - def collect_in_memory(self) -> tuple[sp.csc_matrix, np.ndarray]: - """Collects data from all shards into memory and returns it. - - This method iterates through all data shards, retrieves the feature data and labels - from each shard, and then concatenates them into a single sparse matrix and a single - array, respectively. - - Returns: - A tuple where the first element is a sparse matrix containing the - feature data, and the second element is a numpy array containing the labels. - - Raises: - ValueError: If no data is found in the shards or labels, indicating an issue with input files. - """ - X = [] - y = [] - for i in range(len(self._data_shards)): - X_, y_ = self._get_shard_by_index(i) - X.append(X_) - y.append(y_) - if len(X) == 0 or len(y) == 0: - raise ValueError("No data found in the shards or labels. Please check input files.") - X = sp.vstack(X) - y = np.concatenate(y, axis=0) - return X, y - - -class XGBoostModel(TimeableMixin): - """Class for configuring, training, and evaluating an XGBoost model. - - This class utilizes the configuration settings provided to manage the training and evaluation - process of an XGBoost model, ensuring the model is trained and validated using specified parameters - and data splits. It supports training with in-memory data handling as well as direct streaming from - disk using iterators. - - Args: - cfg: The configuration settings for the model, including data paths, model parameters, - and flags for data handling. - - Attributes: - cfg: Configuration object containing all settings required for model operation. - model: The XGBoost model after being trained. - dtrain: The training dataset in DMatrix format. - dtuning: The tuning (validation) dataset in DMatrix format. - dheld_out: The held-out (test) dataset in DMatrix format. 
- itrain: Iterator for the training dataset. - ituning: Iterator for the tuning dataset. - iheld_out: Iterator for the held-out dataset. - keep_data_in_memory: Flag indicating whether to keep all data in memory or stream from disk. - """ - - def __init__(self, cfg: DictConfig): - """Initializes the XGBoostClassifier with the provided configuration. - - Args: - cfg: The configuration dictionary. - """ - self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory - - self.itrain = None - self.ituning = None - self.iheld_out = None - - self.dtrain = None - self.dtuning = None - self.dheld_out = None - - self.model = None - - @TimeableMixin.TimeAs - def _build(self): - """Builds necessary data structures for training.""" - if self.keep_data_in_memory: - self._build_iterators() - self._build_dmatrix_in_memory() - else: - self._build_iterators() - self._build_dmatrix_from_iterators() - - @TimeableMixin.TimeAs - def _train(self): - """Trains the model.""" - self.model = xgb.train( - OmegaConf.to_container(self.cfg.model_params.model), - self.dtrain, - num_boost_round=self.cfg.model_params.num_boost_round, - early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, - # nthreads=self.cfg.nthreads, - evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], - verbose_eval=0, - ) - - @TimeableMixin.TimeAs - def train(self): - """Trains the model.""" - self._build() - self._train() - - @TimeableMixin.TimeAs - def _build_dmatrix_in_memory(self): - """Builds the DMatrix from the data in memory.""" - X_train, y_train = self.itrain.collect_in_memory() - X_tuning, y_tuning = self.ituning.collect_in_memory() - X_held_out, y_held_out = self.iheld_out.collect_in_memory() - self.dtrain = xgb.DMatrix(X_train, label=y_train) - self.dtuning = xgb.DMatrix(X_tuning, label=y_tuning) - self.dheld_out = xgb.DMatrix(X_held_out, label=y_held_out) - - @TimeableMixin.TimeAs - def _build_dmatrix_from_iterators(self): - """Builds the DMatrix from the iterators.""" - self.dtrain = xgb.DMatrix(self.itrain) - self.dtuning = xgb.DMatrix(self.ituning) - self.dheld_out = xgb.DMatrix(self.iheld_out) - - @TimeableMixin.TimeAs - def _build_iterators(self): - """Builds the iterators for training, validation, and testing.""" - self.itrain = Iterator(self.cfg, split="train") - self.ituning = Iterator(self.cfg, split="tuning") - self.iheld_out = Iterator(self.cfg, split="held_out") - - @TimeableMixin.TimeAs - def evaluate(self) -> float: - """Evaluates the model on the tuning set. - - Returns: - The evaluation metric as the ROC AUC score. - """ - y_pred = self.model.predict(self.dtuning) - y_true = self.dtuning.get_label() - return roc_auc_score(y_true, y_pred) - - @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Optimizes the model based on the provided configuration. 
@@ -442,7 +47,7 @@ def main(cfg: DictConfig) -> float: output_fp = Path(cfg.output_filepath) output_fp.parent.mkdir(parents=True, exist_ok=True) - model.model.save_model(output_fp) + model.save_model(output_fp) except Exception as e: logger.error(f"Error occurred: {e}") auc = 0.0 diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py new file mode 100644 index 0000000..740dc4c --- /dev/null +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -0,0 +1,474 @@ +from collections.abc import Mapping +from pathlib import Path + +import numpy as np +import polars as pl +import scipy.sparse as sp +from mixins import TimeableMixin +from omegaconf import DictConfig + +from .describe_codes import get_feature_columns +from .file_name import get_model_files, list_subdir_files +from .utils import get_feature_indices + + +class TabularDataset(TimeableMixin): + """Tabular Dataset class for loading and processing data shards. + + This class provides functionality for iterating through data shards, loading + feature data and labels, and processing them based on the provided configuration. + + Args: + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. + + Attributes: + cfg: Configuration dictionary containing parameters for + data processing, feature selection, and other settings. + file_name_resolver: Object for resolving file names and paths based on the configuration. + split: The data split being used for loading and processing data shards. + _data_shards: List of data shard names. + valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. + labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. + codes_set: Set of codes to include in the data. + code_masks: Dictionary of code masks for filtering features based on aggregation. + num_features: Total number of features in the data. + """ + + def __init__(self, cfg: DictConfig, split: str = "train"): + """Initializes the Iterator with the provided configuration and data split. + + Args: + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. + """ + super().__init__(cache_prefix=Path(cfg.cache_dir)) + self.cfg = cfg + self.split = split + # Load shards for this split + self._data_shards = sorted( + [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] + ) + self.valid_event_ids, self.labels = None, None + # self.valid_event_ids, self.labels = self._load_ids_and_labels() + + self.codes_set, self.code_masks, self.num_features = self._get_code_set() + + @TimeableMixin.TimeAs + def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: + """Creates boolean masks for filtering features. + + Creates a dictionary of boolean masks for each aggregation type. The masks are used to filter + the feature columns based on the specified included codes and minimum code inclusion frequency. + + Args: + feature_columns: List of feature columns. + codes_set: Set of codes to include. + + Returns: + Dictionary of code masks for each aggregation. 
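+
+        Example:
+            Purely illustrative: with ``aggs = ["code/count", "value/sum"]`` the result
+            resembles ``{"code/count": [True, False, ...], "value/sum": [True, ...]}``,
+            one boolean per feature column of that aggregation, True when the column's
+            feature index is in ``codes_set``.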
+ """ + code_masks = {} + for agg in set(self.cfg.tabularization.aggs): + feature_ids = get_feature_indices(agg, feature_columns) + code_mask = [True if idx in codes_set else False for idx in feature_ids] + code_masks[agg] = code_mask + return code_masks + + @TimeableMixin.TimeAs + def _load_matrix(self, path: Path) -> sp.csc_matrix: + """Loads a sparse matrix from disk. + + Args: + path: Path to the sparse matrix. + + Returns: + The sparse matrix. + + Raises: + ValueError: If the loaded array does not have exactly 3 rows, indicating an unexpected format. + """ + npzfile = np.load(path) + array, shape = npzfile["array"], npzfile["shape"] + if array.shape[0] != 3: + raise ValueError(f"Expected array to have 3 rows, but got {array.shape[0]} rows") + data, row, col = array + return sp.csc_matrix((data, (row, col)), shape=shape) + + @TimeableMixin.TimeAs + def _load_ids_and_labels( + self, load_ids: bool = True, load_labels: bool = True + ) -> tuple[Mapping[int, list], Mapping[int, list]]: + """Loads valid event ids and labels for each shard. + + Returns: + A tuple containing two mappings: one from shard indices to lists of valid event IDs + which is used for indexing rows in the sparse matrix, and another from shard indices + to lists of corresponding labels. + """ + label_fps = { + shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") + for shard in self._data_shards + for shard in self._data_shards + } + cached_labels, cached_event_ids = dict(), dict() + for shard, label_fp in label_fps.items(): + label_df = pl.scan_parquet(label_fp) + if load_ids: + cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() + + # TODO: check this for Nan or any other case we need to worry about + if load_labels: + cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() + if self.cfg.model_params.iterator.binarize_task: + cached_labels[shard] = cached_labels[shard].map_elements( + lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 + ) + + return cached_event_ids if load_ids else None, cached_labels if load_labels else None + + def _load_labels(self) -> tuple[Mapping[int, list], Mapping[int, list]]: + """Loads valid event ids and labels for each shard. + + Returns: + A tuple containing two mappings: one from shard indices to lists of valid event IDs + which is used for indexing rows in the sparse matrix, and another from shard indices + to lists of corresponding labels. + """ + _, cached_labels = self._load_ids_and_labels(load_ids=False) + + return cached_labels + + @TimeableMixin.TimeAs + def _load_event_ids(self) -> tuple[Mapping[int, list], Mapping[int, list]]: + """Loads valid event ids and labels for each shard. + + Returns: + A tuple containing two mappings: one from shard indices to lists of valid event IDs + which is used for indexing rows in the sparse matrix, and another from shard indices + to lists of corresponding labels. + """ + cached_event_ids, _ = self._load_ids_and_labels(load_labels=False) + + return cached_event_ids + + @TimeableMixin.TimeAs + def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: + """Determines the set of feature codes to include based on the configuration settings. + + Returns: + A tuple containing: + - A set of feature indices to be included. + - A mapping from aggregation types to boolean masks indicating whether each feature is included. + - The total number of features. 
+ """ + feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) + feature_dict = {col: i for i, col in enumerate(feature_columns)} + allowed_codes = set(self.cfg.tabularization._resolved_codes) + codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} + + return ( + codes_set, + self._get_code_masks(feature_columns, codes_set), + len(feature_columns), + ) + + @TimeableMixin.TimeAs + def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: + """Loads a specific data shard into memory as a sparse matrix. + + Args: + path: Path to the sparse shard. + idx: Index of the shard. + + Returns: + The sparse matrix loaded from the file. + """ + # column_shard is of form event_idx, feature_idx, value + matrix = self._load_matrix(path) + if path.stem in ["first", "present"]: + agg = f"static/{path.stem}" + else: + agg = f"{path.parent.stem}/{path.stem}" + + return self._filter_shard_on_codes_and_freqs(agg, matrix) + + @TimeableMixin.TimeAs + def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix: + """Loads a shard and returns it as a sparse matrix after applying feature inclusion filtering. + + Args: + idx: Index of the shard to load from disk. + + Returns: + The filtered sparse matrix. + + Raises: + ValueError: If any of the required files for the shard do not exist. + """ + # get all window_size x aggreagation files using the file resolver + files = get_model_files(self.cfg, self.split, self._data_shards[idx]) + + if not all(file.exists() for file in files): + # find missing files + missing_files = [file for file in files if not file.exists()] + raise ValueError( + f"Not all files exist for shard {self._data_shards[idx]}. Missing: {missing_files}" + ) + + dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files] + + combined_csc = sp.hstack(dynamic_cscs, format="csc") + + return combined_csc + + @TimeableMixin.TimeAs + def _get_shard_by_index(self, idx: int) -> tuple[sp.csc_matrix, np.ndarray]: + """Loads a specific shard of data from disk and concatenate with static data. + + Args: + idx: Index of the shard to load. + + Returns: + A tuple containing the combined feature data and the corresponding labels + for the given shard. + """ + dynamic_df = self._get_dynamic_shard_by_index(idx) + label_df = self.labels[self._data_shards[idx]] + return dynamic_df, label_df + + @TimeableMixin.TimeAs + def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.csc_matrix: + """Filters the given data frame based on the inclusion sets and aggregation type. + + Given the codes_mask, the method filters the dynamic data frame to only include + columns that are True in the mask. + + Args: + agg: The aggregation type used to determine the filtering logic. + df: The data frame to be filtered. + + Returns: + The filtered data frame. + """ + if self.codes_set is None: + return df + + ckey = f"_filter_shard_on_codes_and_freqs/{agg}" + self._register_start(key=ckey) + + df = df[:, self.code_masks[agg]] + + self._register_end(key=ckey) + + return df + + def get_data_shards(self, idx: int | list[int]) -> tuple[sp.csc_matrix, np.ndarray]: + """Retrieves the feature data and labels for specific shards. + + Args: + idx: Index of the shard to retrieve or list of indices. + + Returns: + A tuple where the first element is a sparse matrix containing the + feature data, and the second element is a numpy array containing the labels. 
+ """ + X = [] + y = [] + if isinstance(idx, int): + idx = [idx] + for i in idx: + X_, y_ = self._get_shard_by_index(i) + X.append(X_) + y.append(y_) + if len(X) == 0 or len(y) == 0: + raise ValueError("No data found in the shards or labels. Please check input files.") + X = sp.vstack(X) + y = np.concatenate(y, axis=0) + + return X, y + + def get_data(self) -> tuple[sp.csc_matrix, np.ndarray]: + """Retrieves the feature data and labels for the current split. + + Returns: + A tuple where the first element is a sparse matrix containing the + feature data, and the second element is a numpy array containing the labels. + """ + return self.get_data_shards(range(len(self._data_shards))) + + def set_event_ids(self, event_ids=None | list[int]): + """Sets the valid event IDs for each shard. + + Args: + event_ids: List of event IDs for each shard. + """ + if event_ids is None: + self.valid_event_ids = self._load_event_ids() + else: + # parse some list of events they care about + pass + + def set_labels(self, labels=None | list[int]): + """Sets the labels for each shard. + + Args: + labels: List of labels for each shard. + """ + if labels is None: + self.labels = self._load_labels() + else: + # parse some list of events they care about + pass + + def set_codes(self, codes: list[str]): + """Sets the codes to the passed code set. Redeclares the code masks to match. + + Args: + codes: List of codes to include. + """ + self.codes_set = set(codes) + self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) + + def add_code(self, code: str): + """Adds a code to the set of codes to include in the data. + + Args: + code: The code to add to the set. + """ + if code not in self.codes_set: + self.codes_set.add(code) + self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) + + def remove_code(self, code: str): + """Removes a code from the set of codes to include in the data. + + Args: + code: The code to remove from the set. + """ + if code in self.codes_set: + self.codes_set.remove(code) + self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) + + def get_codes(self) -> set[str]: + """Retrieves the set of codes to include in the data. + + Returns: + The set of codes to include. + """ + return self.codes_set + + def get_num_features(self) -> int: + """Retrieves the total number of features in the data. + + Returns: + The total number of features. + """ + return self.num_features + + def get_valid_event_ids(self) -> Mapping[int, list]: + """Retrieves the valid event IDs for each shard. + + Returns: + A mapping from shard indices to lists of valid event IDs. + """ + return self.valid_event_ids + + def get_label(self) -> Mapping[int, list]: + """Retrieves the labels for each shard. + + Returns: + A mapping from shard indices to lists of labels. + """ + return self.labels + + def get_data_shard_list(self) -> list[str]: + """Retrieves the list of data shards. + + Returns: + The list of data shards. + """ + return self._data_shards + + def get_data_shard_count(self) -> int: + """Retrieves the number of data shards. + + Returns: + The number of data shards. + """ + return len(self._data_shards) + + def get_split(self) -> str: + """Retrieves the data split being used. + + Returns: + The data split being used. + """ + return self.split + + def get_classes(self) -> int: + """Retrieves the unique labels in the data. + + Returns: + The unique labels. 
+ """ + # get all labels in a list + all_labels = [] + for label in self.labels.values(): + all_labels.extend(label) + + return np.unique(all_labels) + + def get_all_column_names(self) -> list[str]: + """Retrieves the names of all columns in the data. + + Returns: + The names of all columns. + """ + files = get_model_files(self.cfg, self.split, self._data_shards[0]) + + def extract_name(test_file): + return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem)) + + agg_wind_combos = [extract_name(test_file) for test_file in files] + + feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) + all_feats = [] + for agg_wind in agg_wind_combos: + window, feat, agg = agg_wind.split("/") + feature_ids = get_feature_indices(feat + "/" + agg, feature_columns) + feature_names = [feature_columns[i] for i in feature_ids] + for feat_name in feature_names: + all_feats.append(f"{feat_name}/{agg}/{window}") + + return all_feats + + def get_column_names(self, indices: list[int] = None) -> list[str]: + """Retrieves the names of the columns in the data. + + Returns: + The names of the columns. + """ + files = get_model_files(self.cfg, self.split, self._data_shards[0]) + + def extract_name(test_file): + return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem)) + + agg_wind_combos = [extract_name(test_file) for test_file in files] + + feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) + all_feats = [] + for agg_wind in agg_wind_combos: + window, feat, agg = agg_wind.split("/") + feature_ids = get_feature_indices(feat + "/" + agg, feature_columns) + feature_names = [feature_columns[i] for i in feature_ids] + for feat_name in feature_names: + all_feats.append(f"{feat_name}/{agg}/{window}") + + # filter by only those in the list of indices + all_feats = [all_feats[i] for i in indices] + return all_feats diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py new file mode 100644 index 0000000..371d247 --- /dev/null +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -0,0 +1,194 @@ +from collections.abc import Callable +from pathlib import Path + +import scipy.sparse as sp +import xgboost as xgb +from loguru import logger +from mixins import TimeableMixin +from omegaconf import DictConfig, OmegaConf +from sklearn.metrics import roc_auc_score + +from .tabular_dataset import TabularDataset + + +class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin): + """XGBIterator class for loading and processing data shards for use in XGBoost models. + + This class provides functionality for iterating through data shards, loading + feature data and labels, and processing them based on the provided configuration. + + Args: + cfg: A configuration dictionary containing parameters for + data processing, feature selection, and other settings. + split: The data split to use, which can be one of "train", "tuning", + or "held_out". This determines which subset of the data is loaded and processed. + + Attributes: + cfg: Configuration dictionary containing parameters for + data processing, feature selection, and other settings. + file_name_resolver: Object for resolving file names and paths based on the configuration. + split: The data split being used for loading and processing data shards. + _data_shards: List of data shard names. + valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. 
+ labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. + codes_set: Set of codes to include in the data. + code_masks: Dictionary of code masks for filtering features based on aggregation. + num_features: Total number of features in the data. + """ + + def __init__(self, cfg: DictConfig, split: str): + """Initializes the XGBIterator with the provided configuration and data split. + + Args: + cfg: The configuration dictionary. + split: The data split to use. + """ + xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) + TabularDataset.__init__(self, cfg=cfg, split=split) + TimeableMixin.__init__(self) + self.valid_event_ids, self.labels = self._load_ids_and_labels() + # check if the labels are empty + if self.labels is None: + raise ValueError("No labels found.") + self._it = 0 + + @TimeableMixin.TimeAs + def next(self, input_data: Callable) -> int: + """Advances the XGBIterator by one step and provides data to XGBoost for DMatrix construction. + + Args: + input_data: A function passed by XGBoost with the same signature as `DMatrix`. + + Returns: + 0 if end of iteration, 1 otherwise. + """ + if self._it == len(self._data_shards): + return 0 + + X, y = self._get_shard_by_index(self._it) # self._data_shards[self._it]) + logger.debug(f"X shape: {X.shape}, y shape: {y.shape}") + input_data(data=sp.csr_matrix(X), label=y) + self._it += 1 + + return 1 + + @TimeableMixin.TimeAs + def reset(self): + """Resets the XGBIterator to its beginning.""" + self._it = 0 + + +class XGBoostModel(TimeableMixin): + """Class for configuring, training, and evaluating an XGBoost model. + + This class utilizes the configuration settings provided to manage the training and evaluation + process of an XGBoost model, ensuring the model is trained and validated using specified parameters + and data splits. It supports training with in-memory data handling as well as direct streaming from + disk using XGBIterators. + + Args: + cfg: The configuration settings for the model, including data paths, model parameters, + and flags for data handling. + + Attributes: + cfg: Configuration object containing all settings required for model operation. + model: The XGBoost model after being trained. + dtrain: The training dataset in DMatrix format. + dtuning: The tuning (validation) dataset in DMatrix format. + dheld_out: The held-out (test) dataset in DMatrix format. + itrain: XGBIterator for the training dataset. + ituning: XGBIterator for the tuning dataset. + iheld_out: XGBIterator for the held-out dataset. + keep_data_in_memory: Flag indicating whether to keep all data in memory or stream from disk. + """ + + def __init__(self, cfg: DictConfig): + """Initializes the XGBoostClassifier with the provided configuration. + + Args: + cfg: The configuration dictionary. 
+ """ + self.cfg = cfg + self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + + self.itrain = None + self.ituning = None + self.iheld_out = None + + self.dtrain = None + self.dtuning = None + self.dheld_out = None + + self.model = None + + @TimeableMixin.TimeAs + def _build(self): + """Builds necessary data structures for training.""" + if self.keep_data_in_memory: + self._build_iterators() + self._build_dmatrix_in_memory() + else: + self._build_iterators() + self._build_dmatrix_from_iterators() + + @TimeableMixin.TimeAs + def _train(self): + """Trains the model.""" + self.model = xgb.train( + OmegaConf.to_container(self.cfg.model_params.model), + self.dtrain, + num_boost_round=self.cfg.model_params.num_boost_round, + early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, + # nthreads=self.cfg.nthreads, + evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], + verbose_eval=0, + ) + + @TimeableMixin.TimeAs + def train(self): + """Trains the model.""" + self._build() + self._train() + + @TimeableMixin.TimeAs + def _build_dmatrix_in_memory(self): + """Builds the DMatrix from the data in memory.""" + X_train, y_train = self.itrain.get_data() + X_tuning, y_tuning = self.ituning.get_data() + X_held_out, y_held_out = self.iheld_out.get_data() + self.dtrain = xgb.DMatrix(X_train, label=y_train) + self.dtuning = xgb.DMatrix(X_tuning, label=y_tuning) + self.dheld_out = xgb.DMatrix(X_held_out, label=y_held_out) + + @TimeableMixin.TimeAs + def _build_dmatrix_from_iterators(self): + """Builds the DMatrix from the iterators.""" + self.dtrain = xgb.DMatrix(self.itrain) + self.dtuning = xgb.DMatrix(self.ituning) + self.dheld_out = xgb.DMatrix(self.iheld_out) + + @TimeableMixin.TimeAs + def _build_iterators(self): + """Builds the iterators for training, validation, and testing.""" + self.itrain = XGBIterator(self.cfg, split="train") + self.ituning = XGBIterator(self.cfg, split="tuning") + self.iheld_out = XGBIterator(self.cfg, split="held_out") + + @TimeableMixin.TimeAs + def evaluate(self) -> float: + """Evaluates the model on the tuning set. + + Returns: + The evaluation metric as the ROC AUC score. + """ + y_pred = self.model.predict(self.dtuning) + y_true = self.dtuning.get_label() + return roc_auc_score(y_true, y_pred) + + def save_model(self, output_fp: Path): + """Saves the trained model to the specified file path. + + Args: + output_fp: The file path to save the model to. 
+ """ + self.model.save_model(output_fp) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 130721c..2bcfe66 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -17,6 +17,7 @@ from MEDS_tabular_automl.scripts import ( cache_task, describe_codes, + launch_basemodel, launch_xgboost, tabularize_static, tabularize_time_series, @@ -328,6 +329,44 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.json")) assert len(output_files) == 1 + basemodel_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()] + cfg = compose(config_name="launch_basemodel", overrides=overrides) # config.yaml + + output_dir = Path(cfg.output_cohort_dir) / "model" + + launch_basemodel.main(cfg) + output_files = list(output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 + + basemodel_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_frequency": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "model_params.iterator.keep_data_in_memory": False, + "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()] + cfg = compose(config_name="launch_basemodel", overrides=overrides) # config.yaml + + output_dir = Path(cfg.output_cohort_dir) / "model_online" + + launch_basemodel.main(cfg) + output_files = list(output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 + def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] From 061273050c3148186409a4e32ca3a255623f4e1f Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 15:05:49 +0000 Subject: [PATCH 03/54] [wip] filtering features --- ...asemodel.yaml => launch_sklearnmodel.yaml} | 2 +- .../configs/tabularization/default.yaml | 8 +- ...ch_basemodel.py => launch_sklearnmodel.py} | 8 +- .../scripts/tabularize_static.py | 4 +- .../{base_model.py => sklearn_model.py} | 50 ++++--- src/MEDS_tabular_automl/tabular_dataset.py | 140 +++++------------- src/MEDS_tabular_automl/utils.py | 22 ++- tests/test_tabularize.py | 6 +- 8 files changed, 102 insertions(+), 138 deletions(-) rename src/MEDS_tabular_automl/configs/{launch_basemodel.yaml => launch_sklearnmodel.yaml} (97%) rename src/MEDS_tabular_automl/scripts/{launch_basemodel.py => launch_sklearnmodel.py} (90%) rename src/MEDS_tabular_automl/{base_model.py => sklearn_model.py} (83%) diff --git a/src/MEDS_tabular_automl/configs/launch_basemodel.yaml b/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml similarity index 97% rename from src/MEDS_tabular_automl/configs/launch_basemodel.yaml rename to src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml index 0be805a..805593e 100644 --- a/src/MEDS_tabular_automl/configs/launch_basemodel.yaml +++ b/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml @@ -30,4 +30,4 @@ model_params: log_dir: ${model_dir}/.logs/ -name: launch_basemodel +name: launch_sklearnmodel diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml 
b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index 3f8761c..5166b91 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,7 +1,9 @@ # User inputs -allowed_codes: null -min_code_inclusion_frequency: 10 filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet +allowed_codes: null +min_code_inclusion_count: 10 +min_code_inclusion_frequency: 0.01 +max_included_codes: null window_sizes: - "1d" - "7d" @@ -19,4 +21,4 @@ aggs: - "value/max" # Resolved inputs -_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.filtered_code_metadata_fp}} +_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},$`{tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}`} diff --git a/src/MEDS_tabular_automl/scripts/launch_basemodel.py b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py similarity index 90% rename from src/MEDS_tabular_automl/scripts/launch_basemodel.py rename to src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py index fbbb0ee..f264765 100644 --- a/src/MEDS_tabular_automl/scripts/launch_basemodel.py +++ b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py @@ -4,13 +4,13 @@ from loguru import logger from omegaconf import DictConfig -from ..base_model import BaseModel +from ..sklearn_model import SklearnModel from ..utils import hydra_loguru_init -# config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_basemodel.yaml") +# config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_sklearnmodel.yaml") # if not config_yaml.is_file(): # raise FileNotFoundError("Core configuration not successfully installed!") -config_yaml = Path("./src/MEDS_tabular_automl/configs/launch_basemodel.yaml") +config_yaml = Path("./src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml") @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) @@ -28,7 +28,7 @@ def main(cfg: DictConfig) -> float: if not cfg.loguru_init: hydra_loguru_init() try: - model = BaseModel(cfg) + model = SklearnModel(cfg) model.train() auc = model.evaluate() logger.info(f"AUC: {auc}") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 2474442..e1aa0ee 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -92,9 +92,11 @@ def read_fn(_): def compute_fn(_): filtered_feature_columns = filter_to_codes( + cfg.input_code_metadata_fp, cfg.tabularization.allowed_codes, + cfg.tabularization.min_code_inclusion_count, cfg.tabularization.min_code_inclusion_frequency, - cfg.input_code_metadata_fp, + cfg.tabularization.max_included_codes, ) feature_freqs = get_feature_freqs(cfg.input_code_metadata_fp) filtered_feature_columns_set = set(filtered_feature_columns) diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/sklearn_model.py similarity index 83% rename from src/MEDS_tabular_automl/base_model.py rename to src/MEDS_tabular_automl/sklearn_model.py index b6f0276..cbaa639 100644 --- a/src/MEDS_tabular_automl/base_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -11,8 +11,8 @@ from .tabular_dataset import TabularDataset -class BaseIterator(TabularDataset, TimeableMixin): 
- """BaseIterator class for loading and processing data shards for use in SciKit-Learn models. +class SklearnIterator(TabularDataset, TimeableMixin): + """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models. This class provides functionality for iterating through data shards, loading feature data and labels, and processing them based on the provided configuration. @@ -37,7 +37,7 @@ class BaseIterator(TabularDataset, TimeableMixin): """ def __init__(self, cfg: DictConfig, split: str): - """Initializes the BaseIterator with the provided configuration and data split. + """Initializes the SklearnIterator with the provided configuration and data split. Args: cfg: The configuration dictionary. @@ -57,11 +57,11 @@ def __init__(self, cfg: DictConfig, split: str): # function(data, labels) -class BaseMatrix(TimeableMixin): - """BaseMatrix class for loading and processing data shards for use in SciKit-Learn models.""" +class SklearnMatrix(TimeableMixin): + """SklearnMatrix class for loading and processing data shards for use in SciKit-Learn models.""" def __init__(self, data: sp.csr_matrix, labels: np.ndarray): - """Initializes the BaseMatrix with the provided configuration and data split. + """Initializes the SklearnMatrix with the provided configuration and data split. Args: data @@ -77,7 +77,7 @@ def get_label(self): return self.labels -class BaseModel(TimeableMixin): +class SklearnModel(TimeableMixin): """Class for configuring, training, and evaluating an SciKit-Learn model. This class utilizes the configuration settings provided to manage the training and evaluation @@ -178,40 +178,54 @@ def train(self): @TimeableMixin.TimeAs def _build_matrix_in_memory(self): """Builds the DMatrix from the data in memory.""" - self.dtrain = BaseMatrix(*self.itrain.get_data()) - self.dtuning = BaseMatrix(*self.ituning.get_data()) - self.dheld_out = BaseMatrix(*self.iheld_out.get_data()) + self.dtrain = SklearnMatrix(*self.itrain.get_data()) + self.dtuning = SklearnMatrix(*self.ituning.get_data()) + self.dheld_out = SklearnMatrix(*self.iheld_out.get_data()) @TimeableMixin.TimeAs def _build_iterators(self): """Builds the iterators for training, validation, and testing.""" - self.itrain = BaseIterator(self.cfg, split="train") - self.ituning = BaseIterator(self.cfg, split="tuning") - self.iheld_out = BaseIterator(self.cfg, split="held_out") + self.itrain = SklearnIterator(self.cfg, split="train") + self.ituning = SklearnIterator(self.cfg, split="tuning") + self.iheld_out = SklearnIterator(self.cfg, split="held_out") @TimeableMixin.TimeAs - def evaluate(self) -> float: + def evaluate(self, split: str = "tuning") -> float: """Evaluates the model on the tuning set. Returns: The evaluation metric as the ROC AUC score. 
""" + # depending on split point to correct data + if split == "tuning": + dsplit = self.dtuning + isplit = self.ituning + elif split == "held_out": + dsplit = self.dheld_out + isplit = self.iheld_out + elif split == "train": + dsplit = self.dtrain + isplit = self.itrain + else: + raise ValueError(f"Split {split} is not valid.") + # check if model has predict_proba method if not hasattr(self.model, "predict_proba"): raise ValueError(f"Model {self.model.__class__.__name__} does not have a predict_proba method.") # two cases: data is in memory or data is streamed if self.keep_data_in_memory: - y_pred = self.model.predict_proba(self.dtuning.get_data())[:, 1] - y_true = self.dtuning.get_label() + y_pred = self.model.predict_proba(dsplit.get_data())[:, 1] + y_true = dsplit.get_label() else: y_pred = [] y_true = [] - for shard_idx in range(len(self.ituning._data_shards)): - data, labels = self.ituning.get_data_shards(shard_idx) + for shard_idx in range(len(isplit._data_shards)): + data, labels = isplit.get_data_shards(shard_idx) y_pred.extend(self.model.predict_proba(data)[:, 1]) y_true.extend(labels) y_pred = np.array(y_pred) y_true = np.array(y_true) + # check if y_pred and y_true are not empty if len(y_pred) == 0 or len(y_true) == 0: raise ValueError("Predictions or true labels are empty.") diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 740dc4c..594b82c 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -54,7 +54,6 @@ def __init__(self, cfg: DictConfig, split: str = "train"): [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] ) self.valid_event_ids, self.labels = None, None - # self.valid_event_ids, self.labels = self._load_ids_and_labels() self.codes_set, self.code_masks, self.num_features = self._get_code_set() @@ -75,7 +74,7 @@ def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, code_masks = {} for agg in set(self.cfg.tabularization.aggs): feature_ids = get_feature_indices(agg, feature_columns) - code_mask = [True if idx in codes_set else False for idx in feature_ids] + code_mask = [idx in codes_set for idx in feature_ids] code_masks[agg] = code_mask return code_masks @@ -121,7 +120,6 @@ def _load_ids_and_labels( if load_ids: cached_event_ids[shard] = label_df.select(pl.col("event_id")).collect().to_series() - # TODO: check this for Nan or any other case we need to worry about if load_labels: cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() if self.cfg.model_params.iterator.binarize_task: @@ -171,11 +169,46 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: allowed_codes = set(self.cfg.tabularization._resolved_codes) codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} + if hasattr(self.cfg.tabularization, "max_by_correlation"): + corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1]) + corrs = np.abs(corrs) + sorted_corrs = np.argsort(corrs)[::-1] + codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation]) + if hasattr(self.cfg.tabularization, "min_correlation"): + corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1]) + corrs = np.abs(corrs) + codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0]) + return ( codes_set, self._get_code_masks(feature_columns, codes_set), 
len(feature_columns), ) + + def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarray) -> np.ndarray: + """Calculates the approximate correlation of each feature with the target. + + Args: + X: The feature data. + y: The target labels. + + Returns: + The approximate correlation of each feature with the target. + """ + # calculate the pearson r correlation of each feature with the target + # this is a very rough approximation and should be used for feature selection + # and not as a definitive measure of feature importance + + # check that y has information + if len(np.unique(y)) == 1: + raise ValueError("Labels have no information. Cannot calculate correlation.") + + from scipy.stats import pearsonr + corrs = np.zeros(X.shape[1]) + for i in range(X.shape[1]): + corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0] + return corrs + @TimeableMixin.TimeAs def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: @@ -301,99 +334,6 @@ def get_data(self) -> tuple[sp.csc_matrix, np.ndarray]: """ return self.get_data_shards(range(len(self._data_shards))) - def set_event_ids(self, event_ids=None | list[int]): - """Sets the valid event IDs for each shard. - - Args: - event_ids: List of event IDs for each shard. - """ - if event_ids is None: - self.valid_event_ids = self._load_event_ids() - else: - # parse some list of events they care about - pass - - def set_labels(self, labels=None | list[int]): - """Sets the labels for each shard. - - Args: - labels: List of labels for each shard. - """ - if labels is None: - self.labels = self._load_labels() - else: - # parse some list of events they care about - pass - - def set_codes(self, codes: list[str]): - """Sets the codes to the passed code set. Redeclares the code masks to match. - - Args: - codes: List of codes to include. - """ - self.codes_set = set(codes) - self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) - - def add_code(self, code: str): - """Adds a code to the set of codes to include in the data. - - Args: - code: The code to add to the set. - """ - if code not in self.codes_set: - self.codes_set.add(code) - self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) - - def remove_code(self, code: str): - """Removes a code from the set of codes to include in the data. - - Args: - code: The code to remove from the set. - """ - if code in self.codes_set: - self.codes_set.remove(code) - self.code_masks = self._get_code_masks(self.code_masks.keys(), self.codes_set) - - def get_codes(self) -> set[str]: - """Retrieves the set of codes to include in the data. - - Returns: - The set of codes to include. - """ - return self.codes_set - - def get_num_features(self) -> int: - """Retrieves the total number of features in the data. - - Returns: - The total number of features. - """ - return self.num_features - - def get_valid_event_ids(self) -> Mapping[int, list]: - """Retrieves the valid event IDs for each shard. - - Returns: - A mapping from shard indices to lists of valid event IDs. - """ - return self.valid_event_ids - - def get_label(self) -> Mapping[int, list]: - """Retrieves the labels for each shard. - - Returns: - A mapping from shard indices to lists of labels. - """ - return self.labels - - def get_data_shard_list(self) -> list[str]: - """Retrieves the list of data shards. - - Returns: - The list of data shards. - """ - return self._data_shards - def get_data_shard_count(self) -> int: """Retrieves the number of data shards. 
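The per-feature correlation above calls scipy.stats.pearsonr once per column and densifies each column along the way, which can be slow for wide matrices; these selectors only activate when max_by_correlation or min_correlation is added to the tabularization config (neither is in the defaults). For reference, a vectorized sketch that computes the same per-column Pearson r without the Python loop (illustrative only, not part of the patch; constant columns come out as NaN, matching pearsonr's degenerate case):

import numpy as np
import scipy.sparse as sp


def pearson_per_column(X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
    """Per-column Pearson r of a sparse feature matrix against a label vector."""
    n = X.shape[0]
    y = np.asarray(y, dtype=float)
    mean_x = np.asarray(X.mean(axis=0)).ravel()
    mean_y = y.mean()
    # covariance and variances with population normalization; r is unaffected by the choice
    cov = np.asarray(X.T @ y).ravel() / n - mean_x * mean_y
    var_x = np.asarray(X.multiply(X).mean(axis=0)).ravel() - mean_x**2
    std_y = y.std()
    with np.errstate(divide="ignore", invalid="ignore"):
        return cov / (np.sqrt(var_x) * std_y)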
@@ -402,14 +342,6 @@ def get_data_shard_count(self) -> int: """ return len(self._data_shards) - def get_split(self) -> str: - """Retrieves the data split being used. - - Returns: - The data split being used. - """ - return self.split - def get_classes(self) -> int: """Retrieves the unique labels in the data. diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 49de128..f5e6251 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -46,15 +46,17 @@ def hydra_loguru_init() -> None: def filter_to_codes( - allowed_codes: list[str] | None, - min_code_inclusion_frequency: int, code_metadata_fp: Path, + allowed_codes: list[str] | None, + min_code_inclusion_count: int | None, + min_code_inclusion_frequency: float | None, + max_include_codes: int | None, ) -> list[str]: """Filters and returns codes based on allowed list and minimum frequency. Args: allowed_codes: List of allowed codes, None means all codes are allowed. - min_code_inclusion_frequency: Minimum frequency a code must have to be included. + min_code_inclusion_count: Minimum frequency a code must have to be included. code_metadata_fp: Path to the metadata file containing code information. Returns: @@ -74,7 +76,19 @@ def filter_to_codes( if allowed_codes is not None: feature_freqs = feature_freqs.filter(pl.col("code").is_in(allowed_codes)) - feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_frequency) + if min_code_inclusion_frequency is not None: + pass + # need to consider size of the dataset vs count + + # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) + + if min_code_inclusion_count is not None: + feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) + + if max_include_codes is not None: + feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) + + return sorted(feature_freqs["code"].to_list()) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 2bcfe66..7eacb1b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -17,7 +17,7 @@ from MEDS_tabular_automl.scripts import ( cache_task, describe_codes, - launch_basemodel, + launch_sklearnmodel, launch_xgboost, tabularize_static, tabularize_time_series, @@ -343,7 +343,7 @@ def test_tabularize(): output_dir = Path(cfg.output_cohort_dir) / "model" - launch_basemodel.main(cfg) + launch_sklearnmodel.main(cfg) output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 @@ -363,7 +363,7 @@ def test_tabularize(): output_dir = Path(cfg.output_cohort_dir) / "model_online" - launch_basemodel.main(cfg) + launch_sklearnmodel.main(cfg) output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 From 2feee790a3b1ad0123cde536dd2d948c2789cd4e Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 15:07:03 +0000 Subject: [PATCH 04/54] [wip] filtering features --- src/MEDS_tabular_automl/sklearn_model.py | 2 +- src/MEDS_tabular_automl/tabular_dataset.py | 14 +++++++++----- src/MEDS_tabular_automl/utils.py | 7 +++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index cbaa639..b660123 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -225,7 +225,7 @@ def evaluate(self, split: str = "tuning") -> float: y_true.extend(labels) y_pred = np.array(y_pred) y_true = np.array(y_true) - + # 
check if y_pred and y_true are not empty if len(y_pred) == 0 or len(y_true) == 0: raise ValueError("Predictions or true labels are empty.") diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 594b82c..e484598 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -170,12 +170,16 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} if hasattr(self.cfg.tabularization, "max_by_correlation"): - corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1]) + corrs = self._get_approximate_correlation_per_feature( + self.get_data_shards(0)[0], self.get_data_shards(0)[1] + ) corrs = np.abs(corrs) sorted_corrs = np.argsort(corrs)[::-1] codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation]) if hasattr(self.cfg.tabularization, "min_correlation"): - corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1]) + corrs = self._get_approximate_correlation_per_feature( + self.get_data_shards(0)[0], self.get_data_shards(0)[1] + ) corrs = np.abs(corrs) codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0]) @@ -184,7 +188,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: self._get_code_masks(feature_columns, codes_set), len(feature_columns), ) - + def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarray) -> np.ndarray: """Calculates the approximate correlation of each feature with the target. @@ -202,14 +206,14 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr # check that y has information if len(np.unique(y)) == 1: raise ValueError("Labels have no information. Cannot calculate correlation.") - + from scipy.stats import pearsonr + corrs = np.zeros(X.shape[1]) for i in range(X.shape[1]): corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0] return corrs - @TimeableMixin.TimeAs def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Loads a specific data shard into memory as a sparse matrix. 
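In filter_to_codes the min_code_inclusion_frequency branch is still a placeholder: the commented-out filter expects a frequency column that the count metadata does not carry, and the open question in the comment is what the denominator should be. One possible resolution is to normalize the counts on the fly; the normalization choice below (share of total code occurrences) is an assumption, not something the patch settles. Note also that the max_include_codes branch uses sort("count", reverse=True), whereas newer Polars releases expect descending=True rather than reverse=True.

import polars as pl


def apply_frequency_filter(feature_freqs: pl.DataFrame, min_freq: float) -> pl.DataFrame:
    """Sketch: keep codes whose share of all observed code counts is at least min_freq."""
    total = feature_freqs["count"].sum()
    return feature_freqs.with_columns(
        (pl.col("count") / total).alias("frequency")
    ).filter(pl.col("frequency") >= min_freq)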
diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index f5e6251..badb246 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -78,17 +78,16 @@ def filter_to_codes( if min_code_inclusion_frequency is not None: pass - # need to consider size of the dataset vs count - + # need to consider size of the dataset vs count + # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) - + if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) - return sorted(feature_freqs["code"].to_list()) From f3c985a69a321359e84a3d3865f9852f5c575f9f Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 18:22:03 +0000 Subject: [PATCH 05/54] [wip] sharing for updates only --- src/MEDS_tabular_automl/base_model.py | 23 +++++++ .../configs/launch_model.yaml | 22 +++++++ .../configs/models/sgd_classifier.yaml | 19 ++++++ .../configs/models/xgboost.yaml | 27 ++++++++ .../configs/tabularization/default.yaml | 4 +- src/MEDS_tabular_automl/dense_iterator.py | 54 ++++++++++++++++ .../scripts/launch_autogluon.py | 60 ++++++++++++++++++ .../scripts/launch_model.py | 63 +++++++++++++++++++ .../scripts/launch_xgboost.py | 48 +++++++------- .../scripts/tabularize_static.py | 1 - src/MEDS_tabular_automl/sklearn_model.py | 6 +- src/MEDS_tabular_automl/utils.py | 1 - src/MEDS_tabular_automl/xgboost_model.py | 4 +- tests/test_integration.py | 6 +- tests/test_tabularize.py | 24 +++---- 15 files changed, 316 insertions(+), 46 deletions(-) create mode 100644 src/MEDS_tabular_automl/base_model.py create mode 100644 src/MEDS_tabular_automl/configs/launch_model.yaml create mode 100644 src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml create mode 100644 src/MEDS_tabular_automl/configs/models/xgboost.yaml create mode 100644 src/MEDS_tabular_automl/dense_iterator.py create mode 100644 src/MEDS_tabular_automl/scripts/launch_autogluon.py create mode 100644 src/MEDS_tabular_automl/scripts/launch_model.py diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py new file mode 100644 index 0000000..9f30a07 --- /dev/null +++ b/src/MEDS_tabular_automl/base_model.py @@ -0,0 +1,23 @@ +from typing import Dict, Type +from abc import ABC, abstractmethod +from pathlib import Path +from omegaconf import DictConfig +from mixins import TimeableMixin + + +class BaseModel(ABC, TimeableMixin): + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def train(self): + pass + + @abstractmethod + def evaluate(self) -> float: + pass + + @abstractmethod + def save_model(self, output_fp: Path): + pass diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml new file mode 100644 index 0000000..71fcc14 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -0,0 +1,22 @@ +defaults: + - default + - tabularization: default + - model: xgboost # This can be changed to sgd_classifier or any other model + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + - override hydra/launcher: joblib + - _self_ + +task_name: task + +# Task cached data dir +input_dir: ${output_cohort_dir}/${task_name}/task_cache +# Directory with task labels +input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +# Where to output the model and cached 
data +model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} +output_filepath: ${model_dir}/model_metadata.json + +log_dir: ${model_dir}/.logs/ + +name: launch_model \ No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml new file mode 100644 index 0000000..1b05f15 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml @@ -0,0 +1,19 @@ +model_params: + epochs: 20 + early_stopping_rounds: 5 + model: + type: sklearn + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss + iterator: + keep_data_in_memory: True + binarize_task: True + +hydra: + sweeper: + params: + +model_params.model.alpha: tag(log, interval(1e-6, 1)) + +model_params.model.l1_ratio: interval(0, 1) + +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) + model_params.epochs: range(10, 100) + model_params.early_stopping_rounds: range(1, 10) \ No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/models/xgboost.yaml b/src/MEDS_tabular_automl/configs/models/xgboost.yaml new file mode 100644 index 0000000..a4be06e --- /dev/null +++ b/src/MEDS_tabular_automl/configs/models/xgboost.yaml @@ -0,0 +1,27 @@ +model_params: + num_boost_round: 1000 + early_stopping_rounds: 5 + model: + type: xgboost + # _target_: xgboost.XGBClassifier + booster: gbtree + device: cpu + nthread: 1 + tree_method: hist + objective: binary:logistic + iterator: + keep_data_in_memory: True + binarize_task: True + +hydra: + sweeper: + params: + +model_params.model.eta: tag(log, interval(0.001, 1)) + +model_params.model.lambda: tag(log, interval(0.001, 1)) + +model_params.model.alpha: tag(log, interval(0.001, 1)) + +model_params.model.subsample: interval(0.5, 1) + +model_params.model.min_child_weight: interval(1e-2, 100) + model_params.num_boost_round: range(100, 1000) + model_params.early_stopping_rounds: range(1, 10) + +model_params.model.max_depth: range(2, 16) + tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) \ No newline at end of file diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index 5166b91..a4ffdc6 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -2,7 +2,7 @@ filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet allowed_codes: null min_code_inclusion_count: 10 -min_code_inclusion_frequency: 0.01 +min_code_inclusion_frequency: null max_included_codes: null window_sizes: - "1d" @@ -21,4 +21,4 @@ aggs: - "value/max" # Resolved inputs -_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},$`{tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}`} +_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},${tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}} diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py new file mode 100644 index 0000000..c9c6485 --- /dev/null +++ b/src/MEDS_tabular_automl/dense_iterator.py @@ -0,0 +1,54 @@ +from pathlib import Path + +import hydra +import numpy as np +import scipy.sparse as sp +from loguru import logger +from 
mixins import TimeableMixin +from omegaconf import DictConfig +from sklearn.metrics import roc_auc_score + +from .tabular_dataset import TabularDataset +from .base_model import BaseModel + + +class DenseIterator(TabularDataset, TimeableMixin): + + def __init__(self, cfg: DictConfig, split: str): + """Initializes the SklearnIterator with the provided configuration and data split. + + Args: + cfg: The configuration dictionary. + split: The data split to use. + """ + TabularDataset.__init__(self, cfg=cfg, split=split) + TimeableMixin.__init__(self) + self.valid_event_ids, self.labels = self._load_ids_and_labels() + # check if the labels are empty + if len(self.labels) == 0: + raise ValueError("No labels found.") + # self._it = 0 + + def densify(self) -> np.ndarray: + """Builds the data as a dense matrix based on column subselection.""" + + # get the column indices to include + cols = self.get_feature_indices() + + # map those to the feature names in the data + feature_names = self.get_all_column_names() + selected_features = [feature_names[col] for col in cols] + + # get the dense matrix by iterating through the data shards + data = [] + labels = [] + for shard_idx in range(len(self._data_shards)): + shard_data, shard_labels = self.get_data_shards(shard_idx) + shard_data = shard_data[:, cols] + data.append(shard_data) + labels.append(shard_labels) + data = sp.vstack(data) + labels = np.concatenate(labels, axis=0) + return data, labels, selected_features + + diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py new file mode 100644 index 0000000..ac11c3c --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -0,0 +1,60 @@ +from importlib.resources import files +from pathlib import Path + +import hydra +import pandas as pd +from loguru import logger +from omegaconf import DictConfig + +from MEDS_tabular_automl.dense_iterator import DenseIterator + +from ..utils import hydra_loguru_init + +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) +def main(cfg: DictConfig) -> float: + """Launches AutoGluon after collecting data based on the provided configuration. + + Args: + cfg: The configuration dictionary specifying model and training parameters. + + """ + + # print(OmegaConf.to_yaml(cfg)) + if not cfg.loguru_init: + hydra_loguru_init() + + # check that autogluon is installed + try: + import autogluon as ag + except ImportError: + logger.error("AutoGluon is not installed. 
Please install AutoGluon.") + + # collect data based on the configuration + itrain = DenseIterator(cfg, "train") + ituning = DenseIterator(cfg, "tuning") + iheld_out = DenseIterator(cfg, "held_out") + + # collect data for AutoGluon + train_data, train_labels, cols = itrain.densify() + tuning_data, tuning_labels, _ = ituning.densify() + held_out_data, held_out_labels, _ = iheld_out.densify() + + # construct dfs for AutoGluon + train_df = pd.DataFrame(train_data.todense(), columns=cols) + train_df[cfg.task_name] = train_labels + tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols) + tuning_df[cfg.task_name] = tuning_labels + held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols) + held_out_df[cfg.task_name] = held_out_labels + + # launch AutoGluon + predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df) + + +if __name__ == "__main__": + main() diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py new file mode 100644 index 0000000..7e7fbf8 --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -0,0 +1,63 @@ +from importlib.resources import files +from pathlib import Path + +import hydra +from loguru import logger +from omegaconf import DictConfig +from typing import Dict, Type + +from MEDS_tabular_automl.base_model import BaseModel +from MEDS_tabular_automl.sklearn_model import SklearnModel +from MEDS_tabular_automl.xgboost_model import XGBoostModel + + +MODEL_CLASSES: Dict[str, Type[BaseModel]] = { + "xgboost": XGBoostModel, + "sklearn": SklearnModel +} + +from ..utils import hydra_loguru_init + +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) +def main(cfg: DictConfig) -> float: + """Optimizes the model based on the provided configuration. + + Args: + cfg: The configuration dictionary specifying model and training parameters. + + Returns: + The evaluation result as the ROC AUC score on the held-out test set. 
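launch_model.py routes cfg.model.type through the MODEL_CLASSES mapping, so a new backend only needs to satisfy the BaseModel interface and register itself. A hypothetical sketch (MyModel is illustrative and not part of the patch):

from pathlib import Path

from MEDS_tabular_automl.base_model import BaseModel


class MyModel(BaseModel):
    def __init__(self, cfg):
        self.cfg = cfg

    def train(self):
        ...

    def evaluate(self) -> float:
        ...

    def save_model(self, output_fp: Path):
        ...


MODEL_CLASSES["my_model"] = MyModel  # selected when cfg.model.type == "my_model"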
+ """ + + # print(OmegaConf.to_yaml(cfg)) + if not cfg.loguru_init: + hydra_loguru_init() + try: + model_type = cfg.model.type + ModelClass = MODEL_CLASSES.get(model_type) + if ModelClass is None: + raise ValueError(f"Model type {model_type} not supported.") + + model = ModelClass(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + except Exception as e: + logger.error(f"Error occurred: {e}") + auc = 0.0 + return auc + + +if __name__ == "__main__": + main() diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index fd09e70..22d10c3 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float: # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - try: - model = XGBoostModel(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # print( - # "Time Profiling for window sizes ", - # f"{cfg.tabularization.window_sizes} and min ", - # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", - # ) - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - except Exception as e: - logger.error(f"Error occurred: {e}") - auc = 0.0 + # try: + model = XGBoostModel(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # print( + # "Time Profiling for window sizes ", + # f"{cfg.tabularization.window_sizes} and min ", + # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", + # ) + # print("Train Time: \n", model._profile_durations()) + # print("Train Iterator Time: \n", model.itrain._profile_durations()) + # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + # except Exception as e: + # logger.error(f"Error occurred: {e}") + # auc = 0.0 return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index e1aa0ee..34d9c0d 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -86,7 +86,6 @@ def main( hydra_loguru_init() # Step 1: Cache the filtered features that will be used in the tabularization process and modeling - # import pdb; pdb.set_trace() def read_fn(_): return _ diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index b660123..e8c9f6a 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -9,6 +9,7 @@ from sklearn.metrics import roc_auc_score from .tabular_dataset import TabularDataset +from .base_model import BaseModel class SklearnIterator(TabularDataset, TimeableMixin): @@ -77,7 +78,7 @@ def get_label(self): return 
self.labels -class SklearnModel(TimeableMixin): +class SklearnModel(BaseModel, TimeableMixin): """Class for configuring, training, and evaluating an SciKit-Learn model. This class utilizes the configuration settings provided to manage the training and evaluation @@ -86,7 +87,7 @@ class SklearnModel(TimeableMixin): disk using iterators. Args: - cfg: The configuration settings for the model, including data paths, model parameters, + cfg: The configuration settings for the model, including data paths, model parameters,ß and flags for data handling. Attributes: @@ -107,6 +108,7 @@ def __init__(self, cfg: DictConfig): Args: cfg: The configuration dictionary. """ + super().__init__() self.cfg = cfg self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index badb246..eb4d4d1 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -108,7 +108,6 @@ def load_tqdm(use_tqdm: bool): return tqdm else: - def noop(x, **kwargs): return x diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 371d247..9f0f119 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -9,6 +9,7 @@ from sklearn.metrics import roc_auc_score from .tabular_dataset import TabularDataset +from .base_model import BaseModel class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin): @@ -78,7 +79,7 @@ def reset(self): self._it = 0 -class XGBoostModel(TimeableMixin): +class XGBoostModel(BaseModel, TimeableMixin): """Class for configuring, training, and evaluating an XGBoost model. This class utilizes the configuration settings provided to manage the training and evaluation @@ -108,6 +109,7 @@ def __init__(self, cfg: DictConfig): Args: cfg: The configuration dictionary. 
""" + super().__init__() self.cfg = cfg self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory diff --git a/tests/test_integration.py b/tests/test_integration.py index d22eac5..81336ed 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -112,7 +112,7 @@ def test_integration(): # Step 2: Run the static data tabularization script tabularize_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } stderr, stdout = run_command( @@ -161,7 +161,7 @@ def test_integration(): # Step 3: Run the time series tabularization script tabularize_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -203,7 +203,7 @@ def test_integration(): # Step 4: Run the task_specific_caching script cache_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } with initialize( diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 7eacb1b..931b7a1 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -211,7 +211,7 @@ def test_tabularize(): # Step 2: Tabularization tabularize_static_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -289,7 +289,7 @@ def test_tabularize(): # Step 3: Cache Task data cache_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -313,7 +313,7 @@ def test_tabularize(): xgboost_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -329,17 +329,17 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.json")) assert len(output_files) == 1 - basemodel_config_kwargs = { + sklearnmodel_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()] - cfg = compose(config_name="launch_basemodel", overrides=overrides) # config.yaml + overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model" @@ -347,9 +347,9 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 - basemodel_config_kwargs = { + sklearnmodel_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", "model_params.iterator.keep_data_in_memory": False, "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", @@ -358,8 +358,8 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to 
config.yaml - overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()] - cfg = compose(config_name="launch_basemodel", overrides=overrides) # config.yaml + overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model_online" @@ -390,7 +390,7 @@ def test_xgboost_config(): "hydra.verbose": True, "tqdm": False, "loguru_init": True, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": f"{stdout_ws.strip()}", } From b65754c0a51c75ac951fd4ab0b693b59768a3873 Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 18:28:41 +0000 Subject: [PATCH 06/54] [wip] sharing for updates only --- src/MEDS_tabular_automl/base_model.py | 3 +- .../configs/launch_model.yaml | 4 +- .../configs/models/sgd_classifier.yaml | 2 +- .../configs/models/xgboost.yaml | 2 +- src/MEDS_tabular_automl/dense_iterator.py | 9 ---- .../scripts/launch_autogluon.py | 9 ++-- .../scripts/launch_model.py | 9 +--- .../scripts/launch_xgboost.py | 48 +++++++++---------- src/MEDS_tabular_automl/sklearn_model.py | 2 +- src/MEDS_tabular_automl/utils.py | 1 + src/MEDS_tabular_automl/xgboost_model.py | 2 +- 11 files changed, 39 insertions(+), 52 deletions(-) diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py index 9f30a07..a9943d9 100644 --- a/src/MEDS_tabular_automl/base_model.py +++ b/src/MEDS_tabular_automl/base_model.py @@ -1,7 +1,6 @@ -from typing import Dict, Type from abc import ABC, abstractmethod from pathlib import Path -from omegaconf import DictConfig + from mixins import TimeableMixin diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 71fcc14..50cecf7 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -1,7 +1,7 @@ defaults: - default - tabularization: default - - model: xgboost # This can be changed to sgd_classifier or any other model + - model: xgboost # This can be changed to sgd_classifier or any other model - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib @@ -19,4 +19,4 @@ output_filepath: ${model_dir}/model_metadata.json log_dir: ${model_dir}/.logs/ -name: launch_model \ No newline at end of file +name: launch_model diff --git a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml index 1b05f15..5ebca73 100644 --- a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml @@ -16,4 +16,4 @@ hydra: +model_params.model.l1_ratio: interval(0, 1) +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) \ No newline at end of file + model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/models/xgboost.yaml b/src/MEDS_tabular_automl/configs/models/xgboost.yaml index a4be06e..180f44b 100644 --- a/src/MEDS_tabular_automl/configs/models/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/models/xgboost.yaml @@ -24,4 +24,4 @@ hydra: model_params.num_boost_round: range(100, 1000) model_params.early_stopping_rounds: range(1, 10) +model_params.model.max_depth: range(2, 16) - 
tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) \ No newline at end of file + tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py index c9c6485..0eaddce 100644 --- a/src/MEDS_tabular_automl/dense_iterator.py +++ b/src/MEDS_tabular_automl/dense_iterator.py @@ -1,19 +1,12 @@ -from pathlib import Path - -import hydra import numpy as np import scipy.sparse as sp -from loguru import logger from mixins import TimeableMixin from omegaconf import DictConfig -from sklearn.metrics import roc_auc_score from .tabular_dataset import TabularDataset -from .base_model import BaseModel class DenseIterator(TabularDataset, TimeableMixin): - def __init__(self, cfg: DictConfig, split: str): """Initializes the SklearnIterator with the provided configuration and data split. @@ -50,5 +43,3 @@ def densify(self) -> np.ndarray: data = sp.vstack(data) labels = np.concatenate(labels, axis=0) return data, labels, selected_features - - diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index ac11c3c..cdf8626 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -1,5 +1,4 @@ from importlib.resources import files -from pathlib import Path import hydra import pandas as pd @@ -21,13 +20,12 @@ def main(cfg: DictConfig) -> float: Args: cfg: The configuration dictionary specifying model and training parameters. - """ # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - + # check that autogluon is installed try: import autogluon as ag @@ -54,7 +52,10 @@ def main(cfg: DictConfig) -> float: # launch AutoGluon predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df) - + # TODO: fix logging, etc. 
+ auc = predictor.evaluate(held_out_df) + logger.info(f"AUC: {auc}") + if __name__ == "__main__": main() diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 7e7fbf8..3cd91af 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -4,17 +4,12 @@ import hydra from loguru import logger from omegaconf import DictConfig -from typing import Dict, Type from MEDS_tabular_automl.base_model import BaseModel from MEDS_tabular_automl.sklearn_model import SklearnModel from MEDS_tabular_automl.xgboost_model import XGBoostModel - -MODEL_CLASSES: Dict[str, Type[BaseModel]] = { - "xgboost": XGBoostModel, - "sklearn": SklearnModel -} +MODEL_CLASSES: dict[str, type[BaseModel]] = {"xgboost": XGBoostModel, "sklearn": SklearnModel} from ..utils import hydra_loguru_init @@ -42,7 +37,7 @@ def main(cfg: DictConfig) -> float: ModelClass = MODEL_CLASSES.get(model_type) if ModelClass is None: raise ValueError(f"Model type {model_type} not supported.") - + model = ModelClass(cfg) model.train() auc = model.evaluate() diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 22d10c3..fd09e70 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float: # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - # try: - model = XGBoostModel(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # print( - # "Time Profiling for window sizes ", - # f"{cfg.tabularization.window_sizes} and min ", - # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", - # ) - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - # except Exception as e: - # logger.error(f"Error occurred: {e}") - # auc = 0.0 + try: + model = XGBoostModel(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # print( + # "Time Profiling for window sizes ", + # f"{cfg.tabularization.window_sizes} and min ", + # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", + # ) + # print("Train Time: \n", model._profile_durations()) + # print("Train Iterator Time: \n", model.itrain._profile_durations()) + # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + except Exception as e: + logger.error(f"Error occurred: {e}") + auc = 0.0 return auc diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index e8c9f6a..ea01822 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -8,8 +8,8 @@ from omegaconf import DictConfig from sklearn.metrics import roc_auc_score -from .tabular_dataset import TabularDataset from .base_model import BaseModel +from .tabular_dataset import 
TabularDataset class SklearnIterator(TabularDataset, TimeableMixin): diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index eb4d4d1..badb246 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -108,6 +108,7 @@ def load_tqdm(use_tqdm: bool): return tqdm else: + def noop(x, **kwargs): return x diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 9f0f119..2223c90 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -8,8 +8,8 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from .tabular_dataset import TabularDataset from .base_model import BaseModel +from .tabular_dataset import TabularDataset class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin): From a8d8417e29fede2c8de5cea1e14c448826347df2 Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 18:41:16 +0000 Subject: [PATCH 07/54] [wip] doctests --- src/MEDS_tabular_automl/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index badb246..e61d45b 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -67,7 +67,7 @@ def filter_to_codes( >>> from tempfile import NamedTemporaryFile >>> with NamedTemporaryFile() as f: ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) - ... filter_to_codes(["A", "D"], 3, f.name) + ... filter_to_codes( f.name, ["A", "D"], 3, None, None) ['D'] """ @@ -276,10 +276,9 @@ def write_df(df: pl.LazyFrame | pl.DataFrame | coo_array, fp: Path, do_overwrite ... fp = Path(tmpdir) / "test.npz" ... write_df(df_coo_array, fp, do_overwrite=True) ... assert load_matrix(fp).toarray().tolist() == [[1], [2], [3]] - ... write_df(df_coo_array, fp, do_overwrite=False) - Traceback (most recent call last): - ... - FileExistsError: ...test.npz exists and do_overwrite is False! + ... import pytest + ... with pytest.raises(FileExistsError): + ... 
write_df(df_coo_array, fp, do_overwrite=False) """ if fp.is_file() and not do_overwrite: raise FileExistsError(f"{fp} exists and do_overwrite is {do_overwrite}!") From d07f6a2c4b0146567bfaca23aaf814b30ee4f94d Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Tue, 20 Aug 2024 23:27:16 +0000 Subject: [PATCH 08/54] autogluon --- .../configs/launch_autogluon.yaml | 28 ++++++ .../configs/launch_xgboost.yaml | 2 +- src/MEDS_tabular_automl/dense_iterator.py | 10 +-- .../scripts/launch_autogluon.py | 40 ++++++--- src/MEDS_tabular_automl/tabular_dataset.py | 85 ++++++++++++++++++- 5 files changed, 140 insertions(+), 25 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/launch_autogluon.yaml diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml new file mode 100644 index 0000000..d9a9b74 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml @@ -0,0 +1,28 @@ +defaults: + - default + - tabularization: default + - override hydra/sweeper: optuna + - override hydra/sweeper/sampler: tpe + - override hydra/launcher: joblib + - _self_ + +task_name: task + +# Task cached data dir +input_dir: ${output_cohort_dir}/${task_name}/task_cache +# Directory with task labels +input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +# Where to output the model and cached data +model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S} +output_filepath: ${model_dir} + +# Model parameters +model_params: + iterator: + keep_data_in_memory: True + binarize_task: True + +log_dir: ${model_dir}/.logs/ +log_filepath: ${log_dir}/log.txt + +name: launch_autogluon diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml index af3735d..a95187b 100644 --- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml @@ -53,6 +53,6 @@ hydra: model_params.num_boost_round: range(100, 1000) model_params.early_stopping_rounds: range(1, 10) +model_params.model.max_depth: range(2, 16) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) + tabularization.min_code_inclusion_count: tag(log, range(10, 1000000)) name: launch_xgboost diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py index 0eaddce..33d13b0 100644 --- a/src/MEDS_tabular_automl/dense_iterator.py +++ b/src/MEDS_tabular_automl/dense_iterator.py @@ -25,21 +25,13 @@ def __init__(self, cfg: DictConfig, split: str): def densify(self) -> np.ndarray: """Builds the data as a dense matrix based on column subselection.""" - # get the column indices to include - cols = self.get_feature_indices() - - # map those to the feature names in the data - feature_names = self.get_all_column_names() - selected_features = [feature_names[col] for col in cols] - # get the dense matrix by iterating through the data shards data = [] labels = [] for shard_idx in range(len(self._data_shards)): shard_data, shard_labels = self.get_data_shards(shard_idx) - shard_data = shard_data[:, cols] data.append(shard_data) labels.append(shard_labels) data = sp.vstack(data) labels = np.concatenate(labels, axis=0) - return data, labels, selected_features + return data, labels diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index cdf8626..0e163ea 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ 
b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -9,7 +9,7 @@ from ..utils import hydra_loguru_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") @@ -28,7 +28,7 @@ def main(cfg: DictConfig) -> float: # check that autogluon is installed try: - import autogluon as ag + import autogluon.tabular as ag except ImportError: logger.error("AutoGluon is not installed. Please install AutoGluon.") @@ -38,23 +38,37 @@ def main(cfg: DictConfig) -> float: iheld_out = DenseIterator(cfg, "held_out") # collect data for AutoGluon - train_data, train_labels, cols = itrain.densify() - tuning_data, tuning_labels, _ = ituning.densify() - held_out_data, held_out_labels, _ = iheld_out.densify() + train_data, train_labels = itrain.densify() + tuning_data, tuning_labels = ituning.densify() + held_out_data, held_out_labels = iheld_out.densify() # construct dfs for AutoGluon - train_df = pd.DataFrame(train_data.todense(), columns=cols) + train_df = pd.DataFrame(train_data.todense()) # , columns=cols) train_df[cfg.task_name] = train_labels - tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols) + tuning_df = pd.DataFrame( + tuning_data.todense(), + ) # columns=cols) tuning_df[cfg.task_name] = tuning_labels - held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols) + held_out_df = pd.DataFrame(held_out_data.todense()) # , columns=cols) held_out_df[cfg.task_name] = held_out_labels - # launch AutoGluon - predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df) - # TODO: fix logging, etc. - auc = predictor.evaluate(held_out_df) - logger.info(f"AUC: {auc}") + train_dataset = ag.TabularDataset(train_df) + tuning_dataset = ag.TabularDataset(tuning_df) + held_out_dataset = ag.TabularDataset(held_out_df) + + # train model with AutoGluon + predictor = ag.TabularPredictor( + label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath + ).fit(train_data=train_dataset, tuning_data=tuning_dataset) + + # predict + predictions = predictor.predict(held_out_dataset.drop(columns=[cfg.task_name])) + print("Predictions:", predictions) + # evaluate + score = predictor.evaluate(held_out_dataset) + print("Test score:", score) + + # TODO(model) add tests for autogluon pipeline if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index e484598..c60ca61 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -1,6 +1,7 @@ from collections.abc import Mapping from pathlib import Path +import hydra import numpy as np import polars as pl import scipy.sparse as sp @@ -57,6 +58,9 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.codes_set, self.code_masks, self.num_features = self._get_code_set() + self._set_scaler() + self._set_imputer() + @TimeableMixin.TimeAs def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: """Creates boolean masks for filtering features. 
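One practical caveat with the AutoGluon launcher above: each split is materialized with .todense() before being wrapped in a DataFrame, which can be memory-hungry once the code space is wide. pandas can wrap the scipy matrix without densifying; a sketch of that variant (whether AutoGluon's TabularDataset handles sparse-backed frames as efficiently is not verified here):

import pandas as pd

# train_data / train_labels as returned by DenseIterator.densify();
# the frame is sparse-backed and column labels stay positional, as in the dense version
train_df = pd.DataFrame.sparse.from_spmatrix(train_data)
train_df[cfg.task_name] = train_labels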
@@ -214,6 +218,54 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0] return corrs + def _set_imputer(self): + """Sets the imputer for the data.""" + if hasattr(self.cfg.model_params.iterator, "impute"): + imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer) + if hasattr(imputer, "partial_fit"): + for i in range(len(self._data_shards)): + X, _ = self.get_data_shards(i) + imputer.partial_fit(X) + elif hasattr(imputer, "fit"): + imputer.fit(self.get_data_shards(0)[0]) + else: + raise ValueError("Imputer must have a fit or partial_fit method.") + self.imputer = imputer + else: + self.imputer = None + + def _set_scaler(self): + """Sets the scaler for the data.""" + if hasattr(self.cfg.model_params.iterator, "scaler"): + scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler) + if hasattr(scaler, "partial_fit"): + for i in range(len(self._data_shards)): + X, _ = self.get_data_shards(i) + scaler.partial_fit(X) + elif hasattr(scaler, "fit"): + X = self.get_data_shards(0)[0] + scaler.fit(X) + else: + raise ValueError("Scaler must have a fit or partial_fit method.") + self.scaler = scaler + else: + self.scaler = None + + def _impute_and_scale_data(self, data: sp.csc_matrix) -> sp.csc_matrix: + """Scales the data using the fitted scaler. + + Args: + data: The data to scale. + + Returns: + The scaled data. + """ + if self.imputer is not None: + data = self.imputer.transform(data) + if self.scaler is not None: + return self.scaler.transform(data) + return data + @TimeableMixin.TimeAs def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix: """Loads a specific data shard into memory as a sparse matrix. @@ -320,7 +372,7 @@ def get_data_shards(self, idx: int | list[int]) -> tuple[sp.csc_matrix, np.ndarr idx = [idx] for i in idx: X_, y_ = self._get_shard_by_index(i) - X.append(X_) + X.append(self._impute_and_scale_data(X_)) y.append(y_) if len(X) == 0 or len(y) == 0: raise ValueError("No data found in the shards or labels. Please check input files.") @@ -406,5 +458,34 @@ def extract_name(test_file): all_feats.append(f"{feat_name}/{agg}/{window}") # filter by only those in the list of indices - all_feats = [all_feats[i] for i in indices] + if indices is not None: + all_feats = [all_feats[i] for i in indices] return all_feats + + def get_columns_and_indices(self) -> tuple[list[str], list[int]]: + """Retrieves the names and indices of the columns in the data. + + Returns: + A tuple containing the names of the columns and their indices. 
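The new scaler and imputer hooks hydra-instantiate whatever model_params.iterator.scaler / imputer point to and prefer partial_fit so fitting can stream shard by shard (note the imputer guard checks an "impute" attribute while instantiating "imputer"). A sketch of a sparse-friendly configuration, expressed as an OmegaConf dict rather than YAML; the exact keys are an assumption based on the code above, and a StandardScaler would need with_mean=False to stay sparse:

from omegaconf import OmegaConf

iterator_cfg = OmegaConf.create(
    {
        "keep_data_in_memory": True,
        "binarize_task": True,
        # MaxAbsScaler accepts sparse input and supports partial_fit, so it can be
        # fit incrementally across shards without densifying anything.
        "scaler": {"_target_": "sklearn.preprocessing.MaxAbsScaler"},
    }
)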
+ """ + raise NotImplementedError("This method is not implemented yet.") + files = get_model_files(self.cfg, self.split, self._data_shards[0]) + + def extract_name(test_file): + return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem)) + + agg_wind_combos = [extract_name(test_file) for test_file in files] + + feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) + all_feats = [] + all_indices = [] + for agg_wind in agg_wind_combos: + window, feat, agg = agg_wind.split("/") + feature_ids = get_feature_indices(feat + "/" + agg, feature_columns) + feature_names = [feature_columns[i] for i in feature_ids] + for feat_name in feature_names: + all_feats.append(f"{feat_name}/{agg}/{window}") + # use mask to append indices + all_indices.extend(feature_ids) + + return all_feats, all_indices From 2aebd706af8f7618010fdffc0b91c3cd666cc081 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 20 Aug 2024 23:32:59 +0000 Subject: [PATCH 09/54] added logged warning for static data being empty and added support for launching any model via config specificaly using the launch_model.py hydra script --- src/MEDS_tabular_automl/base_model.py | 8 +++ .../configs/launch_model.yaml | 11 +++- .../{models => model}/sgd_classifier.yaml | 13 +++- .../configs/{models => model}/xgboost.yaml | 15 ++++- .../configs/tabularization/default.yaml | 2 +- .../generate_static_features.py | 2 + .../scripts/launch_autogluon.py | 40 +++++++---- .../scripts/launch_model.py | 42 +++++------- .../scripts/launch_sklearnmodel.py | 8 +-- src/MEDS_tabular_automl/sklearn_model.py | 3 +- tests/test_configs.py | 66 +++++++++++++++++++ tests/test_tabularize.py | 45 +++++++++---- 12 files changed, 195 insertions(+), 60 deletions(-) rename src/MEDS_tabular_automl/configs/{models => model}/sgd_classifier.yaml (61%) rename src/MEDS_tabular_automl/configs/{models => model}/xgboost.yaml (64%) create mode 100644 tests/test_configs.py diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py index a9943d9..35a9ccf 100644 --- a/src/MEDS_tabular_automl/base_model.py +++ b/src/MEDS_tabular_automl/base_model.py @@ -1,7 +1,11 @@ from abc import ABC, abstractmethod from pathlib import Path +from typing import TypeVar from mixins import TimeableMixin +from omegaconf import DictConfig + +T = TypeVar("T") class BaseModel(ABC, TimeableMixin): @@ -20,3 +24,7 @@ def evaluate(self) -> float: @abstractmethod def save_model(self, output_fp: Path): pass + + @classmethod + def initialize(cls: T, **kwargs) -> T: + return cls(DictConfig(kwargs, flags={"allow_objects": True})) diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 50cecf7..ad1414f 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -1,11 +1,11 @@ defaults: + - _self_ - default - tabularization: default - model: xgboost # This can be changed to sgd_classifier or any other model - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib - - _self_ task_name: task @@ -20,3 +20,12 @@ output_filepath: ${model_dir}/model_metadata.json log_dir: ${model_dir}/.logs/ name: launch_model + +hydra: + verbose: False + job: + name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ${log_dir} + run: + dir: ${log_dir} diff --git a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml 
b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml similarity index 61% rename from src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml rename to src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml index 5ebca73..312bb75 100644 --- a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml @@ -1,8 +1,19 @@ +# @package _global_ + +model_target: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + model_params: ${model_params} + input_dir: ${input_dir} + input_label_dir: ${input_label_dir} + model_dir: ${model_dir} + output_filepath: ${output_filepath} + log_dir: ${log_dir} + cache_dir: ${cache_dir} + model_params: epochs: 20 early_stopping_rounds: 5 model: - type: sklearn _target_: sklearn.linear_model.SGDClassifier loss: log_loss iterator: diff --git a/src/MEDS_tabular_automl/configs/models/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml similarity index 64% rename from src/MEDS_tabular_automl/configs/models/xgboost.yaml rename to src/MEDS_tabular_automl/configs/model/xgboost.yaml index 180f44b..ed0af15 100644 --- a/src/MEDS_tabular_automl/configs/models/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model/xgboost.yaml @@ -1,9 +1,20 @@ +# @package _global_ + +model_target: + _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize + model_params: ${model_params} + input_dir: ${input_dir} + input_label_dir: ${input_label_dir} + model_dir: ${model_dir} + output_filepath: ${output_filepath} + log_dir: ${log_dir} + cache_dir: ${cache_dir} + # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. + model_params: num_boost_round: 1000 early_stopping_rounds: 5 model: - type: xgboost - # _target_: xgboost.XGBClassifier booster: gbtree device: cpu nthread: 1 diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index a4ffdc6..6fc3703 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,5 +1,5 @@ # User inputs -filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet +filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 8ff4003..c990ece 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -185,6 +185,8 @@ def get_flat_static_rep( """ static_features = get_feature_names(agg=agg, feature_columns=feature_columns) static_measurements = summarize_static_measurements(agg, static_features, df=shard_df) + if len(static_features) == 0: + raise ValueError(f"No static features found. 
Remove the aggregation function {agg}") # convert to sparse_matrix matrix = get_sparse_static_rep(static_features, static_measurements.lazy(), shard_df, feature_columns) assert matrix.shape[1] == len( diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index cdf8626..0e163ea 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -9,7 +9,7 @@ from ..utils import hydra_loguru_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") @@ -28,7 +28,7 @@ def main(cfg: DictConfig) -> float: # check that autogluon is installed try: - import autogluon as ag + import autogluon.tabular as ag except ImportError: logger.error("AutoGluon is not installed. Please install AutoGluon.") @@ -38,23 +38,37 @@ def main(cfg: DictConfig) -> float: iheld_out = DenseIterator(cfg, "held_out") # collect data for AutoGluon - train_data, train_labels, cols = itrain.densify() - tuning_data, tuning_labels, _ = ituning.densify() - held_out_data, held_out_labels, _ = iheld_out.densify() + train_data, train_labels = itrain.densify() + tuning_data, tuning_labels = ituning.densify() + held_out_data, held_out_labels = iheld_out.densify() # construct dfs for AutoGluon - train_df = pd.DataFrame(train_data.todense(), columns=cols) + train_df = pd.DataFrame(train_data.todense()) # , columns=cols) train_df[cfg.task_name] = train_labels - tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols) + tuning_df = pd.DataFrame( + tuning_data.todense(), + ) # columns=cols) tuning_df[cfg.task_name] = tuning_labels - held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols) + held_out_df = pd.DataFrame(held_out_data.todense()) # , columns=cols) held_out_df[cfg.task_name] = held_out_labels - # launch AutoGluon - predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df) - # TODO: fix logging, etc. 
- auc = predictor.evaluate(held_out_df) - logger.info(f"AUC: {auc}") + train_dataset = ag.TabularDataset(train_df) + tuning_dataset = ag.TabularDataset(tuning_df) + held_out_dataset = ag.TabularDataset(held_out_df) + + # train model with AutoGluon + predictor = ag.TabularPredictor( + label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath + ).fit(train_data=train_dataset, tuning_data=tuning_dataset) + + # predict + predictions = predictor.predict(held_out_dataset.drop(columns=[cfg.task_name])) + print("Predictions:", predictions) + # evaluate + score = predictor.evaluate(held_out_dataset) + print("Test score:", score) + + # TODO(model) add tests for autogluon pipeline if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 3cd91af..df238cc 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -3,17 +3,13 @@ import hydra from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, open_dict from MEDS_tabular_automl.base_model import BaseModel -from MEDS_tabular_automl.sklearn_model import SklearnModel -from MEDS_tabular_automl.xgboost_model import XGBoostModel - -MODEL_CLASSES: dict[str, type[BaseModel]] = {"xgboost": XGBoostModel, "sklearn": SklearnModel} from ..utils import hydra_loguru_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") @@ -32,25 +28,21 @@ def main(cfg: DictConfig) -> float: # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - try: - model_type = cfg.model.type - ModelClass = MODEL_CLASSES.get(model_type) - if ModelClass is None: - raise ValueError(f"Model type {model_type} not supported.") - - model = ModelClass(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - except Exception as e: - logger.error(f"Error occurred: {e}") - auc = 0.0 + + model: BaseModel = hydra.utils.instantiate(cfg.model_target) + # TODO - make tabularuzation be copied in the yaml instead of here + with open_dict(cfg): + model.cfg.tabularization = hydra.utils.instantiate(cfg.tabularization) + + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py index f264765..8e76872 100644 --- a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py +++ b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py @@ -1,3 +1,4 @@ +from importlib.resources import files from pathlib import Path import hydra @@ -7,10 +8,9 @@ from ..sklearn_model import SklearnModel from ..utils import hydra_loguru_init -# config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_sklearnmodel.yaml") -# if not config_yaml.is_file(): -# raise FileNotFoundError("Core configuration not successfully installed!") -config_yaml = Path("./src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml") +config_yaml = 
files("MEDS_tabular_automl").joinpath("configs/launch_sklearnmodel.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index ea01822..9b94fd0 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -1,6 +1,5 @@ from pathlib import Path -import hydra import numpy as np import scipy.sparse as sp from loguru import logger @@ -120,7 +119,7 @@ def __init__(self, cfg: DictConfig): self.dtuning = None self.dheld_out = None - self.model = hydra.utils.call(cfg.model_params.model) + self.model = cfg.model_params.model # check that self.model is a valid model if not hasattr(self.model, "fit"): raise ValueError("Model does not have a fit method.") diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 0000000..8d6acdc --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,66 @@ +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import subprocess + +import hydra +import pytest +from hydra import compose, initialize +from hydra.core.hydra_config import HydraConfig +from loguru import logger + +from MEDS_tabular_automl.sklearn_model import SklearnModel +from MEDS_tabular_automl.xgboost_model import XGBoostModel + +logger.disable("MEDS_tabular_automl") +from omegaconf import OmegaConf + + +def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): + command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] + command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) + stderr = command_out.stderr.decode() + stdout = command_out.stdout.decode() + if command_out.returncode != 0: + raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") + return stderr, stdout + + +def make_config_mutable(cfg): + OmegaConf.set_readonly(cfg, False) + for key in cfg: + if isinstance(cfg[key], OmegaConf): + make_config_mutable(cfg[key]) + + +@pytest.mark.parametrize("model", ["xgboost", "sgd_classifier"]) +def test_model_config(model): + MEDS_cohort_dir = "blah" + xgboost_config_kwargs = { + "MEDS_cohort_dir": MEDS_cohort_dir, + "output_cohort_dir": "blah", + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "tabularization._resolved_codes": "[test,test2]", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"model={model}"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose( + config_name="launch_model", overrides=overrides, return_hydra_config=True + ) # config.yaml + + HydraConfig().set_config(cfg) + # make_config_mutable(cfg) + expected_model_class = XGBoostModel if model == "xgboost" else SklearnModel + model = hydra.utils.instantiate(cfg.model_target) + assert isinstance(model, expected_model_class) + # assert cfg.tabularization.window_sizes diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 931b7a1..ef2582e 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -10,6 +10,7 @@ import polars as pl from hydra import compose, initialize 
+from hydra.core.hydra_config import HydraConfig from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns @@ -17,8 +18,7 @@ from MEDS_tabular_automl.scripts import ( cache_task, describe_codes, - launch_sklearnmodel, - launch_xgboost, + launch_model, tabularize_static, tabularize_time_series, ) @@ -320,12 +320,15 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml + overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose( + config_name="launch_model", overrides=overrides, return_hydra_config=True + ) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model" - launch_xgboost.main(cfg) + HydraConfig().set_config(cfg) + launch_model.main(cfg) output_files = list(output_dir.glob("**/*.json")) assert len(output_files) == 1 @@ -338,12 +341,12 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml + overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model" - launch_sklearnmodel.main(cfg) + launch_model.main(cfg) output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 @@ -358,15 +361,35 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml + overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model_online" - launch_sklearnmodel.main(cfg) + launch_model.main(cfg) output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 + # autogluon_config_kwargs = { + # **shared_config, + # "tabularization.min_code_inclusion_count": 1, + # "tabularization.window_sizes": "[30d,365d,full]", + # "model_params.iterator.keep_data_in_memory": False, + # "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + # } + + # with initialize( + # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + # ): # path to config.yaml + # overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + # cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml + + # output_dir = Path(cfg.output_cohort_dir) / "model_online" + + # launch_model.main(cfg) + # output_files = list(output_dir.glob("**/*.pkl")) + # assert len(output_files) == 1 + def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] From ecf92922774364b1a8ec79e6cf88e93cfac1f396 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 01:33:37 +0000 Subject: [PATCH 10/54] Added support via hydra 
for selecting among four imputation methods (none, mean, median, mode), and three normalization methods (none, standard_scaler, and min_max_scaler) --- .../configs/imputer/default.yaml | 1 + .../configs/imputer/mean_imputer.yaml | 3 ++ .../configs/imputer/median_imputer.yaml | 3 ++ .../configs/imputer/mode_imputer.yaml | 3 ++ .../configs/launch_model.yaml | 2 ++ .../configs/model/sgd_classifier.yaml | 4 +++ .../configs/model/xgboost.yaml | 4 +++ .../configs/normalization/default.yaml | 1 + .../configs/normalization/min_max_scaler.yaml | 2 ++ .../normalization/standard_scaler.yaml | 3 ++ src/MEDS_tabular_automl/tabular_dataset.py | 34 +++++++++++-------- tests/test_configs.py | 8 +++-- 12 files changed, 51 insertions(+), 17 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/imputer/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml create mode 100644 src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml create mode 100644 src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml create mode 100644 src/MEDS_tabular_automl/configs/normalization/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml create mode 100644 src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/default.yaml b/src/MEDS_tabular_automl/configs/imputer/default.yaml new file mode 100644 index 0000000..40d291f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/imputer/default.yaml @@ -0,0 +1 @@ +imputer_target: null diff --git a/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml new file mode 100644 index 0000000..3a87523 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml @@ -0,0 +1,3 @@ +imputer_target: + _target_: sklearn.impute.SimpleImputer + strategy: "mean" diff --git a/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml new file mode 100644 index 0000000..fa82606 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml @@ -0,0 +1,3 @@ +imputer_target: + _target_: sklearn.impute.SimpleImputer + strategy: "median" diff --git a/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml new file mode 100644 index 0000000..247bc11 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml @@ -0,0 +1,3 @@ +imputer_target: + _target_: sklearn.impute.SimpleImputer + strategy: "most_frequent" diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index ad1414f..fc68bc5 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -3,6 +3,8 @@ defaults: - default - tabularization: default - model: xgboost # This can be changed to sgd_classifier or any other model + - imputer: default + - normalization: default - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml index 312bb75..d935cc6 100644 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml @@ -9,6 +9,8 @@ model_target: output_filepath: ${output_filepath} log_dir: ${log_dir} 
cache_dir: ${cache_dir} + imputer: ${model_params.iterator.imputer} + normalization: ${model_params.iterator.normalization} model_params: epochs: 20 @@ -19,6 +21,8 @@ model_params: iterator: keep_data_in_memory: True binarize_task: True + normalization: ${normalization} + imputer: ${imputer} hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml index ed0af15..ebb7c2e 100644 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model/xgboost.yaml @@ -9,6 +9,8 @@ model_target: output_filepath: ${output_filepath} log_dir: ${log_dir} cache_dir: ${cache_dir} + imputer: ${imputer} + normalization: ${normalization} # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. model_params: @@ -23,6 +25,8 @@ model_params: iterator: keep_data_in_memory: True binarize_task: True + normalization: ${normalization} + imputer: ${imputer} hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/normalization/default.yaml b/src/MEDS_tabular_automl/configs/normalization/default.yaml new file mode 100644 index 0000000..d7eeb76 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/normalization/default.yaml @@ -0,0 +1 @@ +normalizer: null diff --git a/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml new file mode 100644 index 0000000..1253b34 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml @@ -0,0 +1,2 @@ +normalizer: + _target_: sklearn.preprocessing.MinMaxScaler diff --git a/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml new file mode 100644 index 0000000..6931610 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml @@ -0,0 +1,3 @@ +normalizer: + _target_: sklearn.preprocessing.StandardScaler + with_mean: False # This preserves the sparsity of the input data. 
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index c60ca61..5e14f91 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -1,12 +1,12 @@ from collections.abc import Mapping from pathlib import Path -import hydra import numpy as np import polars as pl import scipy.sparse as sp from mixins import TimeableMixin from omegaconf import DictConfig +from scipy.stats import pearsonr from .describe_codes import get_feature_columns from .file_name import get_model_files, list_subdir_files @@ -173,19 +173,27 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: allowed_codes = set(self.cfg.tabularization._resolved_codes) codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes} - if hasattr(self.cfg.tabularization, "max_by_correlation"): + if ( + hasattr(self.cfg.tabularization, "max_by_correlation") + and self.cfg.tabularization.max_by_correlation + ): corrs = self._get_approximate_correlation_per_feature( self.get_data_shards(0)[0], self.get_data_shards(0)[1] ) corrs = np.abs(corrs) sorted_corrs = np.argsort(corrs)[::-1] - codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation]) - if hasattr(self.cfg.tabularization, "min_correlation"): + + codes_set = codes_set.intersection( + set(sorted_corrs[: self.cfg.tabularization.max_by_correlation]) + ) + if hasattr(self.cfg.tabularization, "min_correlation") and self.cfg.tabularization.min_correlation: corrs = self._get_approximate_correlation_per_feature( self.get_data_shards(0)[0], self.get_data_shards(0)[1] ) corrs = np.abs(corrs) - codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0]) + codes_set = codes_set.intersection( + set(np.where(corrs > self.cfg.tabularization.min_correlation)[0]) + ) return ( codes_set, @@ -209,19 +217,15 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr # check that y has information if len(np.unique(y)) == 1: - raise ValueError("Labels have no information. Cannot calculate correlation.") - - from scipy.stats import pearsonr + raise ValueError("Labels have only one unique value. 
Cannot calculate correlation.") - corrs = np.zeros(X.shape[1]) - for i in range(X.shape[1]): - corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0] + corrs = np.apply_along_axis(lambda col: pearsonr(col.flatten(), y)[0], 0, X.toarray()) return corrs def _set_imputer(self): """Sets the imputer for the data.""" - if hasattr(self.cfg.model_params.iterator, "impute"): - imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer) + if self.cfg.model_params.iterator.imputer.imputer_target: + imputer = self.cfg.model_params.iterator.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self.get_data_shards(i) @@ -236,8 +240,8 @@ def _set_imputer(self): def _set_scaler(self): """Sets the scaler for the data.""" - if hasattr(self.cfg.model_params.iterator, "scaler"): - scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler) + if self.cfg.model_params.iterator.normalization.normalizer: + scaler = self.cfg.model_params.iterator.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self.get_data_shards(i) diff --git a/tests/test_configs.py b/tests/test_configs.py index 8d6acdc..8c0c138 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -35,7 +35,9 @@ def make_config_mutable(cfg): @pytest.mark.parametrize("model", ["xgboost", "sgd_classifier"]) -def test_model_config(model): +@pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) +@pytest.mark.parametrize("normalization", ["min_max_scaler", "standard_scaler"]) +def test_model_config(model, imputer, normalization): MEDS_cohort_dir = "blah" xgboost_config_kwargs = { "MEDS_cohort_dir": MEDS_cohort_dir, @@ -53,7 +55,9 @@ def test_model_config(model): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"model={model}"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [ + f"{k}={v}" for k, v in xgboost_config_kwargs.items() + ] cfg = compose( config_name="launch_model", overrides=overrides, return_hydra_config=True ) # config.yaml From e6cf0853bcd617aa8af2abf0931430e9cf65a360 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 01:35:39 +0000 Subject: [PATCH 11/54] fixed xgboost model yaml to load imputer and normalization from the model_params which optuna can modify --- src/MEDS_tabular_automl/configs/model/xgboost.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml index ebb7c2e..7eb7c8d 100644 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model/xgboost.yaml @@ -9,8 +9,8 @@ model_target: output_filepath: ${output_filepath} log_dir: ${log_dir} cache_dir: ${cache_dir} - imputer: ${imputer} - normalization: ${normalization} + imputer: ${model_params.iterator.imputer} + normalization: ${model_params.iterator.normalization} # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. 
model_params: From 94dfde2bb36c7b7f6ef39b232acba7c8fbf4193d Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 03:49:22 +0000 Subject: [PATCH 12/54] added autogluon test and cli support --- pyproject.toml | 4 +- .../configs/launch_autogluon.yaml | 2 + .../configs/launch_sklearnmodel.yaml | 33 ----------- .../configs/launch_xgboost.yaml | 58 ------------------- .../scripts/launch_sklearnmodel.py | 58 ------------------- .../scripts/launch_xgboost.py | 58 ------------------- src/MEDS_tabular_automl/tabular_dataset.py | 10 +++- tests/test_tabularize.py | 42 ++++++++------ 8 files changed, 38 insertions(+), 227 deletions(-) delete mode 100644 src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml delete mode 100644 src/MEDS_tabular_automl/configs/launch_xgboost.yaml delete mode 100644 src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py delete mode 100644 src/MEDS_tabular_automl/scripts/launch_xgboost.py diff --git a/pyproject.toml b/pyproject.toml index 5070616..c6d9c6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,9 @@ meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" -meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" +meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_model:main" +meds-tab-model = "MEDS_tabular_automl.scripts.launch_model:main" +meds-tab-autogluon = "MEDS_tabular_automl.scripts.launch_autogluon:main" generate-subsets = "MEDS_tabular_automl.scripts.generate_subsets:main" diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml index d9a9b74..c11d116 100644 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml @@ -1,6 +1,8 @@ defaults: - default - tabularization: default + - imputer: default + - normalization: default - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib diff --git a/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml b/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml deleted file mode 100644 index 805593e..0000000 --- a/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml +++ /dev/null @@ -1,33 +0,0 @@ -defaults: - - default - - tabularization: default - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - - override hydra/launcher: joblib - - _self_ - -task_name: task - -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ -# Where to output the model and cached data -model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -output_filepath: ${model_dir}/model_metadata.json - -# Model parameters -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.SGDClassifier - loss: log_loss - # n_iter: ${model_params.epochs} # not sure if we want this behaviour - iterator: - keep_data_in_memory: True - binarize_task: True - -log_dir: ${model_dir}/.logs/ - -name: launch_sklearnmodel diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml deleted file mode 100644 index a95187b..0000000 --- 
a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml +++ /dev/null @@ -1,58 +0,0 @@ -defaults: - - default - - tabularization: default - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - - override hydra/launcher: joblib - - _self_ - -task_name: task - -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ -# Where to output the model and cached data -model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -output_filepath: ${model_dir}/model_metadata.json - -# Model parameters -model_params: - num_boost_round: 1000 - early_stopping_rounds: 5 - model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic - iterator: - keep_data_in_memory: True - binarize_task: True - -log_dir: ${model_dir}/.logs/ - -hydra: - # Optuna Sweeper - sweeper: - sampler: - seed: 1 - study_name: null #study_${now:%Y-%m-%d_%H-%M-%S} - storage: null - direction: maximize - n_trials: 250 - n_jobs: 25 - - # Define search space for Optuna - params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) - +model_params.model.max_depth: range(2, 16) - tabularization.min_code_inclusion_count: tag(log, range(10, 1000000)) - -name: launch_xgboost diff --git a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py deleted file mode 100644 index 8e76872..0000000 --- a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py +++ /dev/null @@ -1,58 +0,0 @@ -from importlib.resources import files -from pathlib import Path - -import hydra -from loguru import logger -from omegaconf import DictConfig - -from ..sklearn_model import SklearnModel -from ..utils import hydra_loguru_init - -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_sklearnmodel.yaml") -if not config_yaml.is_file(): - raise FileNotFoundError("Core configuration not successfully installed!") - - -@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) -def main(cfg: DictConfig) -> float: - """Optimizes the model based on the provided configuration. - - Args: - cfg: The configuration dictionary specifying model and training parameters. - - Returns: - The evaluation result as the ROC AUC score on the held-out test set. 
- """ - - # print(OmegaConf.to_yaml(cfg)) - if not cfg.loguru_init: - hydra_loguru_init() - try: - model = SklearnModel(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # print( - # "Time Profiling for window sizes ", - # f"{cfg.tabularization.window_sizes} and min ", - # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", - # ) - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - except Exception as e: - logger.error(f"Error occurred: {e}") - auc = 0.0 - return auc - - -if __name__ == "__main__": - main() diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py deleted file mode 100644 index fd09e70..0000000 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ /dev/null @@ -1,58 +0,0 @@ -from importlib.resources import files -from pathlib import Path - -import hydra -from loguru import logger -from omegaconf import DictConfig - -from ..utils import hydra_loguru_init -from ..xgboost_model import XGBoostModel - -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") -if not config_yaml.is_file(): - raise FileNotFoundError("Core configuration not successfully installed!") - - -@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) -def main(cfg: DictConfig) -> float: - """Optimizes the model based on the provided configuration. - - Args: - cfg: The configuration dictionary specifying model and training parameters. - - Returns: - The evaluation result as the ROC AUC score on the held-out test set. 
- """ - - # print(OmegaConf.to_yaml(cfg)) - if not cfg.loguru_init: - hydra_loguru_init() - try: - model = XGBoostModel(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # print( - # "Time Profiling for window sizes ", - # f"{cfg.tabularization.window_sizes} and min ", - # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", - # ) - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - except Exception as e: - logger.error(f"Error occurred: {e}") - auc = 0.0 - return auc - - -if __name__ == "__main__": - main() diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 5e14f91..ff918fe 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -224,7 +224,10 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr def _set_imputer(self): """Sets the imputer for the data.""" - if self.cfg.model_params.iterator.imputer.imputer_target: + if ( + hasattr(self.cfg.model_params.iterator, "imputer") + and self.cfg.model_params.iterator.imputer.imputer_target + ): imputer = self.cfg.model_params.iterator.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): @@ -240,7 +243,10 @@ def _set_imputer(self): def _set_scaler(self): """Sets the scaler for the data.""" - if self.cfg.model_params.iterator.normalization.normalizer: + if ( + hasattr(self.cfg.model_params.iterator, "normalization") + and self.cfg.model_params.iterator.normalization.normalizer + ): scaler = self.cfg.model_params.iterator.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index ef2582e..9a48041 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -2,7 +2,9 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) +import importlib.util import json +import os import subprocess import tempfile from io import StringIO @@ -370,25 +372,31 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 - # autogluon_config_kwargs = { - # **shared_config, - # "tabularization.min_code_inclusion_count": 1, - # "tabularization.window_sizes": "[30d,365d,full]", - # "model_params.iterator.keep_data_in_memory": False, - # "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - # } + if importlib.util.find_spec("autogluon") is not None: + import autogluon as ag - # with initialize( - # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - # ): # path to config.yaml - # overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - # cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml + from MEDS_tabular_automl.scripts import launch_autogluon - # output_dir = Path(cfg.output_cohort_dir) / "model_online" + autogluon_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + 
"model_params.iterator.keep_data_in_memory": False, + "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + } - # launch_model.main(cfg) - # output_files = list(output_dir.glob("**/*.pkl")) - # assert len(output_files) == 1 + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] + cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml + + output_dir = Path(cfg.output_cohort_dir) / "model_online" + + launch_autogluon.main(cfg) + output_files = list(output_dir.glob("*")) + most_recent_file = max(output_files, key=os.path.getmtime) + ag.tabular.TabularPredictor.load(most_recent_file) def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): @@ -421,5 +429,5 @@ def test_xgboost_config(): version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml assert cfg.tabularization.window_sizes From 527eda59e9024282bc6d326da90481a759b399ef Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 05:46:58 +0000 Subject: [PATCH 13/54] added three more sklearn models and fixed bug with normalzation and imputers so they run on mimiciv test data --- .../configs/model/knn_classifier.yaml | 39 ++++++++++++++++ .../configs/model/logistic_regression.yaml | 44 ++++++++++++++++++ .../model/random_forest_classifier.yaml | 46 +++++++++++++++++++ .../configs/normalization/max_abs_scaler.yaml | 2 + .../configs/normalization/min_max_scaler.yaml | 2 - src/MEDS_tabular_automl/tabular_dataset.py | 10 ++-- 6 files changed, 137 insertions(+), 6 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/model/knn_classifier.yaml create mode 100644 src/MEDS_tabular_automl/configs/model/logistic_regression.yaml create mode 100644 src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml create mode 100644 src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml delete mode 100644 src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml new file mode 100644 index 0000000..58a7850 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml @@ -0,0 +1,39 @@ +# @package _global_ + +model_target: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + model_params: ${model_params} + input_dir: ${input_dir} + input_label_dir: ${input_label_dir} + model_dir: ${model_dir} + output_filepath: ${output_filepath} + log_dir: ${log_dir} + cache_dir: ${cache_dir} + imputer: ${model_params.iterator.imputer} + normalization: ${model_params.iterator.normalization} + +model_params: + epochs: 20 + early_stopping_rounds: 5 + model: + _target_: sklearn.neighbors.KNeighborsClassifier + weights: "distance" + leaf_size: 30 + p: 2 + metric: "minkowski" + iterator: + keep_data_in_memory: True + binarize_task: True + normalization: ${normalization} + imputer: ${imputer} + +hydra: + sweeper: + params: + model_params.model.n_neighbors: range(1, 20) + model_params.model.weights: choice(['uniform', 'distance']) + model_params.model.leaf_size: range(10, 50) + 
model_params.model.p: choice([1, 2]) + model_params.model.metric: choice(['minkowski', 'euclidean', 'manhattan']) + model_params.epochs: range(10, 100) + model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml new file mode 100644 index 0000000..f8e283f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml @@ -0,0 +1,44 @@ +# @package _global_ + +model_target: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + model_params: ${model_params} + input_dir: ${input_dir} + input_label_dir: ${input_label_dir} + model_dir: ${model_dir} + output_filepath: ${output_filepath} + log_dir: ${log_dir} + cache_dir: ${cache_dir} + imputer: ${model_params.iterator.imputer} + normalization: ${model_params.iterator.normalization} + +model_params: + epochs: 20 + early_stopping_rounds: 5 + model: + _target_: sklearn.linear_model.LogisticRegression + penalty: "l2" + dual: false + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: 1 + class_weight: null + random_state: null + solver: "lbfgs" + max_iter: 100 + + iterator: + keep_data_in_memory: True + binarize_task: True + normalization: ${normalization} + imputer: ${imputer} + +hydra: + sweeper: + params: + model_params.model.C: tag(log, interval(1e-6, 1)) + model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) + model_params.model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) + model_params.epochs: range(10, 100) + model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml new file mode 100644 index 0000000..2bd9d01 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml @@ -0,0 +1,46 @@ +# @package _global_ + +model_target: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + model_params: ${model_params} + input_dir: ${input_dir} + input_label_dir: ${input_label_dir} + model_dir: ${model_dir} + output_filepath: ${output_filepath} + log_dir: ${log_dir} + cache_dir: ${cache_dir} + imputer: ${model_params.iterator.imputer} + normalization: ${model_params.iterator.normalization} + +model_params: + epochs: 20 + early_stopping_rounds: 5 + model: + _target_: sklearn.ensemble.RandomForestClassifier + criterion: "gini" + max_depth: null + min_samples_split: 2 + min_samples_leaf: 1 + min_weight_fraction_leaf: 0.0 + max_features: "sqrt" + max_leaf_nodes: null + min_impurity_decrease: 0.0 + bootstrap: True + iterator: + keep_data_in_memory: True + binarize_task: True + normalization: ${normalization} + imputer: ${imputer} + +hydra: + sweeper: + params: + model_params.model.n_estimators: range(50, 300, 50) + model_params.model.max_depth: choice([null, 10, 20, 30, 40, 50]) + model_params.model.min_samples_split: range(2, 11) + model_params.model.min_samples_leaf: range(1, 5) + model_params.model.max_features: choice(['sqrt', 'log2', null]) + model_params.model.bootstrap: choice([True, False]) + model_params.model.criterion: choice(['gini', 'entropy']) + model_params.epochs: range(10, 100) + model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml new file mode 100644 index 0000000..4cf4e1a --- /dev/null +++ 
b/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml @@ -0,0 +1,2 @@ +normalizer: + _target_: sklearn.preprocessing.MaxAbsScaler diff --git a/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml deleted file mode 100644 index 1253b34..0000000 --- a/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml +++ /dev/null @@ -1,2 +0,0 @@ -normalizer: - _target_: sklearn.preprocessing.MinMaxScaler diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index ff918fe..5a6ba43 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -231,10 +231,10 @@ def _set_imputer(self): imputer = self.cfg.model_params.iterator.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): - X, _ = self.get_data_shards(i) + X, _ = self._get_shard_by_index(i) imputer.partial_fit(X) elif hasattr(imputer, "fit"): - imputer.fit(self.get_data_shards(0)[0]) + imputer.fit(self._get_shard_by_index(0)[0]) else: raise ValueError("Imputer must have a fit or partial_fit method.") self.imputer = imputer @@ -250,10 +250,10 @@ def _set_scaler(self): scaler = self.cfg.model_params.iterator.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): - X, _ = self.get_data_shards(i) + X, _ = self._get_shard_by_index(i) scaler.partial_fit(X) elif hasattr(scaler, "fit"): - X = self.get_data_shards(0)[0] + X = self._get_shard_by_index(0)[0] scaler.fit(X) else: raise ValueError("Scaler must have a fit or partial_fit method.") @@ -337,6 +337,8 @@ def _get_shard_by_index(self, idx: int) -> tuple[sp.csc_matrix, np.ndarray]: for the given shard. 
""" dynamic_df = self._get_dynamic_shard_by_index(idx) + if self.labels is None: + self.labels = self._load_labels() label_df = self.labels[self._data_shards[idx]] return dynamic_df, label_df From 0d7ed275d4c4c5a7fb21a527ddf2016700fb36e3 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 06:41:41 +0000 Subject: [PATCH 14/54] fixed bugs so correlation code filters work now --- .../configs/tabularization/default.yaml | 2 ++ src/MEDS_tabular_automl/tabular_dataset.py | 10 +++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index 6fc3703..ada7dc9 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -3,6 +3,8 @@ filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null +min_correlation: null +max_by_correlation: null max_included_codes: null window_sizes: - "1d" diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 5a6ba43..84a6609 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -177,9 +177,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: hasattr(self.cfg.tabularization, "max_by_correlation") and self.cfg.tabularization.max_by_correlation ): - corrs = self._get_approximate_correlation_per_feature( - self.get_data_shards(0)[0], self.get_data_shards(0)[1] - ) + corrs = self._get_approximate_correlation_per_feature(*self._get_shard_by_index(0)) corrs = np.abs(corrs) sorted_corrs = np.argsort(corrs)[::-1] @@ -187,9 +185,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]: set(sorted_corrs[: self.cfg.tabularization.max_by_correlation]) ) if hasattr(self.cfg.tabularization, "min_correlation") and self.cfg.tabularization.min_correlation: - corrs = self._get_approximate_correlation_per_feature( - self.get_data_shards(0)[0], self.get_data_shards(0)[1] - ) + corrs = self._get_approximate_correlation_per_feature(*self._get_shard_by_index(0)) corrs = np.abs(corrs) codes_set = codes_set.intersection( set(np.where(corrs > self.cfg.tabularization.min_correlation)[0]) @@ -356,7 +352,7 @@ def _filter_shard_on_codes_and_freqs(self, agg: str, df: sp.csc_matrix) -> sp.cs Returns: The filtered data frame. 
""" - if self.codes_set is None: + if not hasattr(self, "codes_set") or self.codes_set is None: return df ckey = f"_filter_shard_on_codes_and_freqs/{agg}" From 9c542eaae25f1169fae23b4b6893f3fd2edec0ba Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Wed, 21 Aug 2024 13:18:11 +0000 Subject: [PATCH 15/54] sweeper --- src/MEDS_tabular_automl/configs/model/knn_classifier.yaml | 4 ++++ .../configs/model/logistic_regression.yaml | 4 ++++ .../configs/model/random_forest_classifier.yaml | 4 ++++ src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml | 3 +++ src/MEDS_tabular_automl/configs/model/xgboost.yaml | 4 ++++ 5 files changed, 19 insertions(+) diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml index 58a7850..9e86e81 100644 --- a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml @@ -29,6 +29,10 @@ model_params: hydra: sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 + params: model_params.model.n_neighbors: range(1, 20) model_params.model.weights: choice(['uniform', 'distance']) diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml index f8e283f..bdfc19a 100644 --- a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml +++ b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml @@ -36,6 +36,10 @@ model_params: hydra: sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 + params: model_params.model.C: tag(log, interval(1e-6, 1)) model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml index 2bd9d01..c36c506 100644 --- a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml @@ -34,6 +34,10 @@ model_params: hydra: sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 + params: model_params.model.n_estimators: range(50, 300, 50) model_params.model.max_depth: choice([null, 10, 20, 30, 40, 50]) diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml index d935cc6..8411817 100644 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml @@ -26,6 +26,9 @@ model_params: hydra: sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 params: +model_params.model.alpha: tag(log, interval(1e-6, 1)) +model_params.model.l1_ratio: interval(0, 1) diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml index 7eb7c8d..11c8d81 100644 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model/xgboost.yaml @@ -30,6 +30,10 @@ model_params: hydra: sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 + params: +model_params.model.eta: tag(log, interval(0.001, 1)) +model_params.model.lambda: tag(log, interval(0.001, 1)) From 1a519ffd8ac4df3785ef280ba7fadd1d6c7f1879 Mon Sep 17 00:00:00 2001 From: Teya Bergamaschi Date: Wed, 21 Aug 2024 17:14:56 +0000 Subject: [PATCH 16/54] logging --- .../hydra/callbacks/evaluation_callback.yaml | 5 ++ .../configs/launch_model.yaml | 11 ++-- .../evaluation_callback.py | 59 +++++++++++++++++++ 
.../scripts/launch_model.py | 23 +++++++- src/MEDS_tabular_automl/xgboost_model.py | 15 ++++- 5 files changed, 103 insertions(+), 10 deletions(-) create mode 100644 src/MEDS_tabular_automl/configs/hydra/callbacks/evaluation_callback.yaml create mode 100644 src/MEDS_tabular_automl/evaluation_callback.py diff --git a/src/MEDS_tabular_automl/configs/hydra/callbacks/evaluation_callback.yaml b/src/MEDS_tabular_automl/configs/hydra/callbacks/evaluation_callback.yaml new file mode 100644 index 0000000..82d4687 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/hydra/callbacks/evaluation_callback.yaml @@ -0,0 +1,5 @@ +# @package _global_ +hydra: + callbacks: + evaluation_callback: + _target_: MEDS_tabular_automl.evaluation_callback.EvaluationCallback diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index fc68bc5..ad47261 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -5,6 +5,7 @@ defaults: - model: xgboost # This can be changed to sgd_classifier or any other model - imputer: default - normalization: default + - override hydra/callbacks: evaluation_callback - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib @@ -18,16 +19,16 @@ input_label_dir: ${output_cohort_dir}/${task_name}/labels/ # Where to output the model and cached data model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} output_filepath: ${model_dir}/model_metadata.json - -log_dir: ${model_dir}/.logs/ +model_log_dir: ${model_dir}/.logs/ +save_top_k: -1 name: launch_model hydra: verbose: False job: - name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S} + name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: - dir: ${log_dir} + dir: ${model_log_dir} run: - dir: ${log_dir} + dir: ${model_log_dir} diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py new file mode 100644 index 0000000..84ce5bb --- /dev/null +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -0,0 +1,59 @@ +import ast +from pathlib import Path + +import pandas as pd +from hydra.experimental.callback import Callback +from omegaconf import DictConfig, OmegaConf + + +class EvaluationCallback(Callback): + def __init__(self, **kwargs): + self.kwargs = kwargs + + def on_multirun_end(self, config: DictConfig, **kwargs): + """Find best model based on log files and print its performance and hyperparameters.""" + log_fp = Path(config.model_log_dir) + + performance = pd.read_csv( + log_fp / "performance.log", sep=",", header=None + ) # , columns=["model_fp", "tuning_auc", "test_auc"]) + performance.columns = ["model_fp", "tuning_auc", "test_auc"] + performance.sort_values("tuning_auc", ascending=False, inplace=True) + print(performance.head()) + + hyperparams = pd.read_csv(log_fp / "hyperparameters.log", sep="\t", header=None) + hyperparams.columns = ["model_fp", "tabularization", "model_params"] + + best_model = performance.head(1)["model_fp"].values[0] + best_hyperparams = hyperparams[hyperparams["model_fp"] == best_model] + + print(f"The best model can be found at {best_model}") + self.print_performance(performance.head(1)) + self.print_hyperparams(best_hyperparams) + if hasattr(config, "save_top_k") and config.save_top_k >= 0: + self.save_top_k_models(performance, config.save_top_k, config.model_dir) + + return performance.head(1) + + def print_performance(self, performance): + """Print performance of 
the best model with nice formatting.""" + print("Performance of the best model:") + print(f"Tuning AUC: {performance['tuning_auc'].values[0]}") + print(f"Test AUC: {performance['test_auc'].values[0]}") + + def print_hyperparams(self, hyperparams): + """Print hyperparameters of the best model with nice formatting.""" + print("Hyperparameters of the best model:") + print( + f"Tabularization: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['tabularization'].values[0]))}" + ) + print( + f"Model parameters: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['model_params'].values[0]))}" + ) + + def save_top_k_models(self, performance, k, model_dir): + """Save only top k models from the model directory and delete all other files.""" + top_k_models = performance.head(k)["model_fp"].values + for model_fp in Path(model_dir).iterdir(): + if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp) not in top_k_models: + model_fp.unlink() diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index df238cc..d0232aa 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -1,8 +1,8 @@ +import time from importlib.resources import files from pathlib import Path import hydra -from loguru import logger from omegaconf import DictConfig, open_dict from MEDS_tabular_automl.base_model import BaseModel @@ -14,6 +14,21 @@ raise FileNotFoundError("Core configuration not successfully installed!") +def log_to_logfile(model, cfg, output_fp): + """Log model hyperparameters and performance to two log files.""" + log_fp = Path(cfg.model_log_dir) + log_fp.mkdir(parents=True, exist_ok=True) + # log hyperparameters + with open(log_fp / "hyperparameters.log", "a") as f: + f.write(f"{output_fp}\t") + f.write(f"{cfg.tabularization}\t") + f.write(f"{cfg.model_params}\n") + + # log performance + with open(log_fp / "performance.log", "a") as f: + f.write(f"{output_fp}, {model.evaluate()}, {model.evaluate(split='held_out')}\n") + + @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Optimizes the model based on the provided configuration. @@ -36,12 +51,16 @@ def main(cfg: DictConfig) -> float: model.train() auc = model.evaluate() - logger.info(f"AUC: {auc}") + # logger.info(f"AUC: {auc}") # save model output_fp = Path(cfg.output_filepath) + output_fp = output_fp.parent / f"{output_fp.stem}_{auc:.4f}_{time.time()}{output_fp.suffix}" output_fp.parent.mkdir(parents=True, exist_ok=True) + # log to logfile + log_to_logfile(model, cfg, output_fp) + model.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 2223c90..424d9f0 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -177,14 +177,23 @@ def _build_iterators(self): self.iheld_out = XGBIterator(self.cfg, split="held_out") @TimeableMixin.TimeAs - def evaluate(self) -> float: + def evaluate(self, split="tuning") -> float: """Evaluates the model on the tuning set. Returns: The evaluation metric as the ROC AUC score. 
""" - y_pred = self.model.predict(self.dtuning) - y_true = self.dtuning.get_label() + if split == "tuning": + y_pred = self.model.predict(self.dtuning) + y_true = self.dtuning.get_label() + elif split == "held_out": + y_pred = self.model.predict(self.dheld_out) + y_true = self.dheld_out.get_label() + elif split == "train": + y_pred = self.model.predict(self.dtrain) + y_true = self.dtrain.get_label() + else: + raise ValueError(f"Invalid split for evaluation: {split}") return roc_auc_score(y_true, y_pred) def save_model(self, output_fp: Path): From 8fc88632fc2f51f35f4abc5a04f2a8a7a73272bf Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Wed, 21 Aug 2024 20:26:59 +0000 Subject: [PATCH 17/54] made tash caching parallelize and updated tests for configs --- src/MEDS_tabular_automl/scripts/cache_task.py | 54 ++++++++++--------- tests/test_configs.py | 7 ++- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 15c194b..884dadb 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """Aggregates time-series data for feature columns across different window sizes.""" -from functools import partial from importlib.resources import files from pathlib import Path @@ -104,35 +103,40 @@ def main(cfg: DictConfig): split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] raw_data_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" - raw_data_df = filter_parquet(raw_data_fp, cfg.tabularization._resolved_codes) - raw_data_df = ( - get_unique_time_events_df(get_events_df(raw_data_df, feature_columns)) - .with_row_index("event_id") - .select("patient_id", "time", "event_id") - ) - shard_label_df = label_df.join( - raw_data_df.select("patient_id").unique(), on="patient_id", how="inner" - ).join_asof(other=raw_data_df, by="patient_id", on="time") - shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" - rwlock_wrap( - raw_data_fp, - shard_label_fp, - pl.scan_parquet, - write_lazyframe, - lambda df: shard_label_df, - do_overwrite=cfg.do_overwrite, - do_return=False, - ) - out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") - compute_fn = partial(generate_row_cached_matrix, label_df=shard_label_df) - write_fn = partial(write_df, do_overwrite=cfg.do_overwrite) + + def read_fn(in_fp_tuple): + raw_data_fp, data_fp = in_fp_tuple + raw_data_df = filter_parquet(raw_data_fp, cfg.tabularization._resolved_codes) + matrix = load_matrix(data_fp) + return raw_data_df, matrix + + def compute_fn(input_tuple): + raw_data_df, matrix = input_tuple + raw_data_df = ( + get_unique_time_events_df(get_events_df(raw_data_df, feature_columns)) + .with_row_index("event_id") + .select("patient_id", "time", "event_id") + ) + shard_label_df = label_df.join( + raw_data_df.select("patient_id").unique(), on="patient_id", how="inner" + ).join_asof(other=raw_data_df, by="patient_id", on="time") + + row_cached_matrix = generate_row_cached_matrix(matrix=matrix, label_df=shard_label_df) + + return shard_label_df, row_cached_matrix + + def write_fn(output_tuple, out_fp): + shard_label_df, row_cached_matrix = output_tuple + Path(shard_label_fp).parent.mkdir(parents=True, exist_ok=True) + write_lazyframe(shard_label_df, shard_label_fp) + write_df(row_cached_matrix, out_fp, do_overwrite=cfg.do_overwrite) rwlock_wrap( - 
data_fp, + (raw_data_fp, data_fp), out_fp, - load_matrix, + read_fn, write_fn, compute_fn, do_overwrite=cfg.do_overwrite, diff --git a/tests/test_configs.py b/tests/test_configs.py index 8c0c138..708d270 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -34,9 +34,12 @@ def make_config_mutable(cfg): make_config_mutable(cfg[key]) -@pytest.mark.parametrize("model", ["xgboost", "sgd_classifier"]) +@pytest.mark.parametrize( + "model", + ["xgboost", "sgd_classifier", "knn_classifier", "logistic_regression", "random_forest_classifier"], +) @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) -@pytest.mark.parametrize("normalization", ["min_max_scaler", "standard_scaler"]) +@pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) def test_model_config(model, imputer, normalization): MEDS_cohort_dir = "blah" xgboost_config_kwargs = { From 3e223bb89c94e94f5c61a870b444d9b13ec5f148 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Thu, 22 Aug 2024 07:48:05 +0000 Subject: [PATCH 18/54] added more thourough tests for output file paths of task caching and tabularizing. Standardized output directory for autogluon model. Additionally fixed label bug so labels are deduplicated --- .../configs/launch_autogluon.yaml | 1 + .../configs/task_specific_caching.yaml | 2 +- src/MEDS_tabular_automl/scripts/cache_task.py | 74 ++- .../scripts/launch_autogluon.py | 14 +- .../scripts/launch_model.py | 28 +- src/MEDS_tabular_automl/utils.py | 3 +- tests/test_integration.py | 370 +++++++------- tests/test_tabularize.py | 460 +++++++++--------- 8 files changed, 505 insertions(+), 447 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml index c11d116..567d11c 100644 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml @@ -16,6 +16,7 @@ input_dir: ${output_cohort_dir}/${task_name}/task_cache input_label_dir: ${output_cohort_dir}/${task_name}/labels/ # Where to output the model and cached data model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S} +model_log_dir: ${model_dir}/.logs/ output_filepath: ${model_dir} # Model parameters diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index c002dfc..7a3ee79 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -7,7 +7,7 @@ task_name: task # Tabularized Data input_dir: ${output_cohort_dir}/tabularize # Where the labels are stored, with columns patient_id, timestamp, label -input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels +input_label_dir: ${MEDS_cohort_dir}/tasks/${task_name}/ # Where to output the task specific tabularized data output_dir: ${output_cohort_dir}/${task_name}/task_cache output_label_dir: ${output_cohort_dir}/${task_name}/labels diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 884dadb..8d36974 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -8,6 +8,7 @@ import numpy as np import polars as pl import scipy.sparse as sp +from loguru import logger from omegaconf import DictConfig from ..describe_codes import filter_parquet, get_feature_columns @@ -85,14 +86,20 @@ def main(cfg: DictConfig): # shuffle tasks 
tabularization_tasks = list_subdir_files(cfg.input_dir, "npz") + np.random.shuffle(tabularization_tasks) label_dir = Path(cfg.input_label_dir) - label_df = pl.scan_parquet(label_dir / "**/*.parquet").rename( - { - "prediction_time": "time", - cfg.label_column: "label", - } + label_df = ( + pl.scan_parquet(label_dir / "**/*.parquet") + .rename( + { + "prediction_time": "time", + cfg.label_column: "label", + } + ) + .group_by(pl.col("patient_id", "time"), maintain_order=True) + .first() ) feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) @@ -101,40 +108,55 @@ def main(cfg: DictConfig): for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] - - raw_data_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" + meds_data_in_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") - def read_fn(in_fp_tuple): - raw_data_fp, data_fp = in_fp_tuple - raw_data_df = filter_parquet(raw_data_fp, cfg.tabularization._resolved_codes) - matrix = load_matrix(data_fp) - return raw_data_df, matrix - - def compute_fn(input_tuple): - raw_data_df, matrix = input_tuple - raw_data_df = ( - get_unique_time_events_df(get_events_df(raw_data_df, feature_columns)) + def read_meds_data_df(meds_data_fp): + if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: + raise ValueError( + f"'numeric_value' column not found in raw data {meds_data_fp}. " + "You are maybe loading labels instead or meds data" + ) + return filter_parquet(meds_data_fp, cfg.tabularization._resolved_codes) + + def extract_labels(meds_data_df): + meds_data_df = ( + get_unique_time_events_df(get_events_df(meds_data_df, feature_columns)) .with_row_index("event_id") .select("patient_id", "time", "event_id") ) shard_label_df = label_df.join( - raw_data_df.select("patient_id").unique(), on="patient_id", how="inner" - ).join_asof(other=raw_data_df, by="patient_id", on="time") + meds_data_df.select("patient_id").unique(), on="patient_id", how="inner" + ).join_asof(other=meds_data_df, by="patient_id", on="time") + return shard_label_df - row_cached_matrix = generate_row_cached_matrix(matrix=matrix, label_df=shard_label_df) + def read_fn(in_fp_tuple): + meds_data_fp, data_fp = in_fp_tuple + assert "data" in str(meds_data_fp) + # TODO: replace this with more intelligent locking + if not Path(shard_label_fp).exists(): + logger.info(f"Extracting labels for {shard_label_fp}") + Path(shard_label_fp).parent.mkdir(parents=True, exist_ok=True) + meds_data_df = read_meds_data_df(meds_data_fp) + extracted_events = extract_labels(meds_data_df) + write_lazyframe(extracted_events, shard_label_fp) + else: + logger.info(f"Labels already exist, reading from {shard_label_fp}") + shard_label_df = pl.scan_parquet(shard_label_fp) + matrix = load_matrix(data_fp) + return shard_label_df, matrix - return shard_label_df, row_cached_matrix + def compute_fn(input_tuple): + shard_label_df, matrix = input_tuple + row_cached_matrix = generate_row_cached_matrix(matrix=matrix, label_df=shard_label_df) + return row_cached_matrix - def write_fn(output_tuple, out_fp): - shard_label_df, row_cached_matrix = output_tuple - Path(shard_label_fp).parent.mkdir(parents=True, exist_ok=True) - write_lazyframe(shard_label_df, shard_label_fp) + 
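The label handling above leans on two polars operations: duplicate (patient_id, time) label rows are collapsed with a grouped `first()`, and each remaining label is then matched to the closest *prior* event via an as-of join so it can inherit that event's `event_id`. A minimal, self-contained sketch of that pattern on toy data (column names mirror the code above; nothing here is taken from the package itself):

```python
import polars as pl
from datetime import datetime

events = (
    pl.DataFrame(
        {
            "patient_id": [1, 2, 1],
            "time": [datetime(2020, 1, 1), datetime(2020, 3, 1), datetime(2020, 6, 1)],
        }
    )
    .sort("time")
    .with_row_index("event_id")
    .lazy()
)

labels = pl.LazyFrame(
    {
        "patient_id": [2, 2, 1],
        "time": [datetime(2020, 4, 1), datetime(2020, 4, 1), datetime(2020, 7, 1)],
        "label": [0, 0, 1],
    }
)

# Collapse duplicate (patient_id, time) label rows, keeping the first occurrence.
labels = labels.group_by("patient_id", "time", maintain_order=True).first()

# Attach the event_id of the latest event at or before each label's time.
aligned = labels.sort("time").join_asof(other=events, by="patient_id", on="time").collect()

assert aligned.height == 2
assert aligned["event_id"].to_list() == [1, 2]  # patient 2 -> event at 2020-03-01, patient 1 -> 2020-06-01
```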
def write_fn(row_cached_matrix, out_fp): write_df(row_cached_matrix, out_fp, do_overwrite=cfg.do_overwrite) rwlock_wrap( - (raw_data_fp, data_fp), + (meds_data_in_fp, data_fp), out_fp, read_fn, write_fn, diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index 0e163ea..db61e9f 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -1,4 +1,5 @@ from importlib.resources import files +from pathlib import Path import hydra import pandas as pd @@ -63,12 +64,17 @@ def main(cfg: DictConfig) -> float: # predict predictions = predictor.predict(held_out_dataset.drop(columns=[cfg.task_name])) - print("Predictions:", predictions) + logger.info("Predictions:", predictions) # evaluate score = predictor.evaluate(held_out_dataset) - print("Test score:", score) - - # TODO(model) add tests for autogluon pipeline + logger.info("Test score:", score) + + log_fp = Path(cfg.model_log_dir) + log_fp.mkdir(parents=True, exist_ok=True) + # log hyperparameters + out_fp = log_fp / "trial_performance_results.log" + with open(out_fp, "w") as f: + f.write(f"{cfg.output_filepath}\t{cfg.tabularization}\t{cfg.model_params}\t{None}\t{score}\n") if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index d0232aa..b4fa3de 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -6,6 +6,7 @@ from omegaconf import DictConfig, open_dict from MEDS_tabular_automl.base_model import BaseModel +from MEDS_tabular_automl.mapper import wrap as rwlock_wrap from ..utils import hydra_loguru_init @@ -19,14 +20,25 @@ def log_to_logfile(model, cfg, output_fp): log_fp = Path(cfg.model_log_dir) log_fp.mkdir(parents=True, exist_ok=True) # log hyperparameters - with open(log_fp / "hyperparameters.log", "a") as f: - f.write(f"{output_fp}\t") - f.write(f"{cfg.tabularization}\t") - f.write(f"{cfg.model_params}\n") - - # log performance - with open(log_fp / "performance.log", "a") as f: - f.write(f"{output_fp}, {model.evaluate()}, {model.evaluate(split='held_out')}\n") + out_fp = log_fp / "trial_performance_results.log" + + def write_fn(_, out_fp): + with open(out_fp, "a") as f: + f.write( + f"{output_fp}\t{cfg.tabularization}\t{cfg.model_params}" + f"\t{model.evaluate()}\t{model.evaluate(split='held_out')}\n" + ) + + rwlock_wrap( + None, + out_fp, + lambda _: None, # read_fn is ignored + write_fn, + cache_intermediate=True, + clear_cache_on_completion=True, + do_overwrite=True, + do_return=False, + ) @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index e61d45b..4971ed7 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -86,7 +86,8 @@ def filter_to_codes( feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: - feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) + # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) + feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes) return sorted(feature_freqs["code"].to_list()) diff --git a/tests/test_integration.py b/tests/test_integration.py index 81336ed..12948ee 100644 --- a/tests/test_integration.py +++ 
b/tests/test_integration.py @@ -4,7 +4,6 @@ import json import subprocess -import tempfile from io import StringIO from pathlib import Path @@ -42,196 +41,191 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test return stderr, stdout -def test_integration(): +def test_integration(tmp_path): # Step 0: Setup Environment - with tempfile.TemporaryDirectory() as d: - MEDS_cohort_dir = Path(d) / "MEDS_cohort_dir" - output_cohort_dir = Path(d) / "output_cohort_dir" - - shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - } - - describe_codes_config = {**shared_config} - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml - - # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) - - # Store MEDS outputs - all_data = [] - for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) - df = pl.read_csv(StringIO(data)).with_columns( - pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f") - ) - df.write_parquet(file_path) - all_data.append(df) - - all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) - - # Check the files are not empty - meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") - assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 - ), "MEDS train split Data Files Should be 4!" - for f in meds_files: - assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
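The test refactor here swaps the hand-rolled `tempfile.TemporaryDirectory()` context manager for pytest's built-in `tmp_path` fixture, which drops one level of indentation and gives every test an isolated directory that pytest manages. A minimal sketch of the pattern (a hypothetical test, not one from this suite):

```python
from pathlib import Path


def test_writes_one_file(tmp_path: Path):
    # pytest injects a unique per-test directory; no manual cleanup is needed.
    out_fp = tmp_path / "output_cohort_dir" / "result.txt"
    out_fp.parent.mkdir(parents=True, exist_ok=True)
    out_fp.write_text("ok")
    assert out_fp.read_text() == "ok"
```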
- split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" - json.dump(split_json, splits_fp.open("w")) - - # Step 1: Run the describe_codes script - stderr, stdout = run_command( - "meds-tab-describe", - [], - describe_codes_config, - "describe_codes", + MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" + output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + + shared_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "output_cohort_dir": str(output_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + } + + describe_codes_config = {**shared_config} + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] + cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + + # Create the directories + (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + + # Store MEDS outputs + all_data = [] + for split, data in MEDS_OUTPUTS.items(): + file_path = output_cohort_dir / "data" / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) + df.write_parquet(file_path) + all_data.append(df) + + all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) + + # Check the files are not empty + meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") + assert ( + len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + ), "MEDS train split Data Files Should be 4!" + for f in meds_files: + assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
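For context on the `.shards.json` file written just below: it appears to map a `"<split>/<shard_number>"` key to the list of patient IDs assigned to that shard, matching the `SPLITS_JSON` constant these tests parse. A tiny sketch of producing such a file (values mirror `SPLITS_JSON`, with the held-out shard omitted for brevity):

```python
import json
from pathlib import Path

splits = {"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281]}

splits_fp = Path("output_cohort_dir") / ".shards.json"
splits_fp.parent.mkdir(parents=True, exist_ok=True)
splits_fp.write_text(json.dumps(splits, indent=2))

assert json.loads(splits_fp.read_text()) == splits
```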
+ split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = output_cohort_dir / ".shards.json" + json.dump(split_json, splits_fp.open("w")) + + # Step 1: Run the describe_codes script + stderr, stdout = run_command( + "meds-tab-describe", + [], + describe_codes_config, + "describe_codes", + ) + assert Path(cfg.output_filepath).is_file() + + feature_columns = get_feature_columns(cfg.output_filepath) + assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) + assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) + assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) + for value_agg in VALUE_AGGREGATIONS: + assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) + + # Step 2: Run the static data tabularization script + tabularize_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } + stderr, stdout = run_command( + "meds-tab-tabularize-static", + [], + tabularize_config, + "tabularization", + ) + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in tabularize_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + + output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # Check the files are not empty + for f in output_files: + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" 
) - assert Path(cfg.output_filepath).is_file() - - feature_columns = get_feature_columns(cfg.output_filepath) - assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) - assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) - assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) - for value_agg in VALUE_AGGREGATIONS: - assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) - - # Step 2: Run the static data tabularization script - tabularize_config = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } - stderr, stdout = run_command( - "meds-tab-tabularize-static", - [], - tabularize_config, - "tabularization", + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] ) - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in tabularize_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - - output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) - actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] - assert set(actual_files) == set(EXPECTED_STATIC_FILES) - # Check the files are not empty - for f in output_files: - static_matrix = load_matrix(f) - assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) - assert static_matrix.shape[1] == expected_num_cols, ( - f"Static Data Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {static_matrix.shape[1]}!" - ) - split = f.parts[-5] - shard_num = f.parts[-4] - med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") - expected_num_rows = ( - get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) - .collect() - .shape[0] - ) - assert static_matrix.shape[0] == expected_num_rows, ( - f"Static Data matrix Should have {expected_num_rows}" - f" rows but has {static_matrix.shape[0]}!" - ) - allowed_codes = cfg.tabularization._resolved_codes - num_allowed_codes = len(allowed_codes) - feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) - assert num_allowed_codes == len( - feature_columns - ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" - - # Step 3: Run the time series tabularization script - tabularize_config = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } - - stderr, stdout = run_command( - "meds-tab-tabularize-time-series", - ["--multirun", 'worker="range(0,1)"', "hydra/launcher=joblib"], - tabularize_config, - "tabularization", + assert static_matrix.shape[0] == expected_num_rows, ( + f"Static Data matrix Should have {expected_num_rows}" f" rows but has {static_matrix.shape[0]}!" 
) - - # confirm summary files exist: - output_files = list_subdir_files(cfg.output_dir, "npz") - actual_files = [ - get_shard_prefix(Path(cfg.output_dir), each) + ".npz" - for each in output_files - if "none/static" not in str(each) - ] - assert len(actual_files) > 0 - for f in output_files: - ts_matrix = load_matrix(f) - assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) - assert ts_matrix.shape[1] == expected_num_cols, ( - f"Time-Series Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {ts_matrix.shape[1]}!" - ) - split = f.parts[-5] - shard_num = f.parts[-4] - med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") - expected_num_rows = ( - get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) - .collect() - .shape[0] - ) - assert ts_matrix.shape[0] == expected_num_rows, ( - f"Time-Series Data matrix Should have {expected_num_rows}" - f" rows but has {ts_matrix.shape[0]}!" - ) - # Step 4: Run the task_specific_caching script - cache_config = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml - - df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() - pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) - df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) - df = df.select("patient_id", pl.col("time").alias("prediction_time"), "boolean_value") - - out_fp = Path(cfg.input_label_dir) / "0.parquet" - out_fp.parent.mkdir(parents=True, exist_ok=True) - df.write_parquet(out_fp) - - stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") - stderr, stdout_agg = run_command( - "generate-subsets", ["[static/present,static/first]"], {}, "generate-subsets aggs" + allowed_codes = cfg.tabularization._resolved_codes + num_allowed_codes = len(allowed_codes) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) + assert num_allowed_codes == len( + feature_columns + ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" + + # Step 3: Run the time series tabularization script + tabularize_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } + + stderr, stdout = run_command( + "meds-tab-tabularize-time-series", + ["--multirun", 'worker="range(0,1)"', "hydra/launcher=joblib"], + tabularize_config, + "tabularization", + ) + + # confirm summary files exist: + output_files = list_subdir_files(cfg.output_dir, "npz") + actual_files = [ + get_shard_prefix(Path(cfg.output_dir), each) + ".npz" + for each in output_files + if "none/static" not in str(each) + ] + assert len(actual_files) > 0 + for f in output_files: + ts_matrix = load_matrix(f) + assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" 
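These shape assertions run against the `.npz` sparse shards produced by tabularization: rows correspond to unique event rows, columns to features. As a point of reference, here is a self-contained round-trip of a sparse matrix through an `.npz` file using plain `scipy.sparse` (the package's own `load_matrix`/`write_df` helpers may store additional metadata, so this is only an approximation of the on-disk format):

```python
import numpy as np
import scipy.sparse as sp

matrix = sp.random(10, 5, density=0.2, format="csr", random_state=0)

sp.save_npz("example_shard.npz", matrix)
reloaded = sp.load_npz("example_shard.npz")

assert reloaded.shape == (10, 5)  # rows: events, cols: features
assert np.allclose(matrix.toarray(), reloaded.toarray())
```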
+ expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) + assert ts_matrix.shape[1] == expected_num_cols, ( + f"Time-Series Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {ts_matrix.shape[1]}!" ) - - stderr, stdout = run_command( - "meds-tab-cache-task", - [ - "--multirun", - f"tabularization.aggs={stdout_agg.strip()}", - ], - cache_config, - "task_specific_caching", + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert ts_matrix.shape[0] == expected_num_rows, ( + f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" ) + # Step 4: Run the task_specific_caching script + cache_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in cache_config.items()] + cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + + df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) + df = df.select("patient_id", pl.col("time").alias("prediction_time"), "boolean_value") + + out_fp = Path(cfg.input_label_dir) / "0.parquet" + out_fp.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_fp) + + stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") + stderr, stdout_agg = run_command( + "generate-subsets", ["[static/present,static/first]"], {}, "generate-subsets aggs" + ) + + stderr, stdout = run_command( + "meds-tab-cache-task", + [ + "--multirun", + f"tabularization.aggs={stdout_agg.strip()}", + ], + cache_config, + "task_specific_caching", + ) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 9a48041..7691440 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -6,7 +6,6 @@ import json import os import subprocess -import tempfile from io import StringIO from pathlib import Path @@ -36,6 +35,7 @@ logger.disable("MEDS_tabular_automl") SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 +NUM_SHARDS = 4 MEDS_TRAIN_0 = """ patient_id,code,time,numeric_value @@ -149,210 +149,257 @@ ] -def test_tabularize(): - with tempfile.TemporaryDirectory() as d: - MEDS_cohort_dir = Path(d) / "MEDS_cohort_dir" - output_cohort_dir = Path(d) / "output_cohort_dir" +def test_tabularize(tmp_path): + MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" + output_cohort_dir = Path(tmp_path) / "output_cohort_dir" - shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - } + shared_config = { + "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "output_cohort_dir": str(output_cohort_dir.resolve()), + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + } - 
describe_codes_config = {**shared_config} + describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml - - # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) - - # Store MEDS outputs - all_data = [] - for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) - df = pl.read_csv(StringIO(data)).with_columns( - pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f") - ) - df.write_parquet(file_path) - all_data.append(df) - - all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) - - # Check the files are not empty - meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") - assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 - ), "MEDS train split Data Files Should be 4!" - for f in meds_files: - assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" - split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" - json.dump(split_json, splits_fp.open("w")) - # Step 1: Describe Codes - compute code frequencies - describe_codes.main(cfg) - - assert Path(cfg.output_filepath).is_file() - - feature_columns = get_feature_columns(cfg.output_filepath) - assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) - assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) - assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) - for value_agg in VALUE_AGGREGATIONS: - assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) - - # Step 2: Tabularization - tabularize_static_config = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] + cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + + # Create the directories + (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + + # Store MEDS outputs + all_data = [] + for split, data in MEDS_OUTPUTS.items(): + file_path = output_cohort_dir / "data" / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True) + df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) + df.write_parquet(file_path) + all_data.append(df) + + all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) + + # Check the files are not empty + meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") + assert ( + len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + ), "MEDS train split Data Files Should be 4!" + for f in meds_files: + assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
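One detail of the setup above worth calling out: the synthetic shards are stacked with `pl.concat(..., how="diagonal_relaxed")`, which unions frames that do not share an identical schema, filling absent columns with nulls and casting mismatched dtypes to a common supertype. A small illustration on toy frames:

```python
import polars as pl

a = pl.DataFrame({"patient_id": [1], "code": ["DX//A"]})
b = pl.DataFrame({"patient_id": [2], "code": ["LAB//B"], "numeric_value": [1.5]})

combined = pl.concat([a, b], how="diagonal_relaxed")

assert set(combined.columns) == {"patient_id", "code", "numeric_value"}
assert combined["numeric_value"][0] is None  # missing column filled with null
```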
+ split_json = json.load(StringIO(SPLITS_JSON)) + splits_fp = output_cohort_dir / ".shards.json" + json.dump(split_json, splits_fp.open("w")) + # Step 1: Describe Codes - compute code frequencies + describe_codes.main(cfg) + + assert Path(cfg.output_filepath).is_file() + + feature_columns = get_feature_columns(cfg.output_filepath) + assert get_feature_names("code/count", feature_columns) == sorted(CODE_COLS) + assert get_feature_names("static/present", feature_columns) == sorted(STATIC_PRESENT_COLS) + assert get_feature_names("static/first", feature_columns) == sorted(STATIC_FIRST_COLS) + for value_agg in VALUE_AGGREGATIONS: + assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) + + # Step 2: Tabularization + tabularize_static_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - tabularize_static.main(cfg) - - output_dir = Path(cfg.output_cohort_dir) / "tabularize" - - output_files = list(output_dir.glob("**/static/**/*.npz")) - actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] - assert set(actual_files) == set(EXPECTED_STATIC_FILES) - # Check the files are not empty - for f in output_files: - static_matrix = load_matrix(f) - assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) - assert static_matrix.shape[1] == expected_num_cols, ( - f"Static Data Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {static_matrix.shape[1]}!" - ) - split = f.parts[-5] - shard_num = f.parts[-4] - med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") - expected_num_rows = ( - get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) - .collect() - .shape[0] - ) - assert static_matrix.shape[0] == expected_num_rows, ( - f"Static Data matrix Should have {expected_num_rows}" - f" rows but has {static_matrix.shape[0]}!" - ) - allowed_codes = cfg.tabularization._resolved_codes - num_allowed_codes = len(allowed_codes) - feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) - assert num_allowed_codes == len( - feature_columns - ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" - - tabularize_time_series.main(cfg) - - # confirm summary files exist: - output_files = list_subdir_files(str(output_dir.resolve()), "npz") - actual_files = [ - get_shard_prefix(output_dir, each) + ".npz" - for each in output_files - if "none/static" not in str(each) - ] - assert len(actual_files) > 0 - for f in output_files: - ts_matrix = load_matrix(f) - assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" - expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) - assert ts_matrix.shape[1] == expected_num_cols, ( - f"Time-Series Tabular Dataframe Should have {expected_num_cols}" - f"Columns but has {ts_matrix.shape[1]}!" 
- ) - split = f.parts[-5] - shard_num = f.parts[-4] - med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") - expected_num_rows = ( - get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) - .collect() - .shape[0] - ) - assert ts_matrix.shape[0] == expected_num_rows, ( - f"Time-Series Data matrix Should have {expected_num_rows}" - f" rows but has {ts_matrix.shape[0]}!" - ) - - # Step 3: Cache Task data - cache_config = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + tabularize_static.main(cfg) + + output_dir = Path(cfg.output_cohort_dir) / "tabularize" + + output_files = list(output_dir.glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] + assert set(actual_files) == set(EXPECTED_STATIC_FILES) + # Check the files are not empty + for f in output_files: + static_matrix = load_matrix(f) + assert static_matrix.shape[0] > 0, "Static Data Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"static/{f.stem}", feature_columns)) + assert static_matrix.shape[1] == expected_num_cols, ( + f"Static Data Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {static_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert static_matrix.shape[0] == expected_num_rows, ( + f"Static Data matrix Should have {expected_num_rows}" f" rows but has {static_matrix.shape[0]}!" + ) + allowed_codes = cfg.tabularization._resolved_codes + num_allowed_codes = len(allowed_codes) + feature_columns = get_feature_columns(cfg.tabularization.filtered_code_metadata_fp) + assert num_allowed_codes == len( + feature_columns + ), f"Should have {len(feature_columns)} codes but has {num_allowed_codes}" + + tabularize_time_series.main(cfg) + + # confirm summary files exist: + output_files = list_subdir_files(str(output_dir.resolve()), "npz") + actual_files = [ + get_shard_prefix(output_dir, each) + ".npz" for each in output_files if "none/static" not in str(each) + ] + assert len(actual_files) > 0 + for f in output_files: + ts_matrix = load_matrix(f) + assert ts_matrix.shape[0] > 0, "Time-Series Tabular Dataframe Should not be Empty!" + expected_num_cols = len(get_feature_names(f"{f.parent.stem}/{f.stem}", feature_columns)) + assert ts_matrix.shape[1] == expected_num_cols, ( + f"Time-Series Tabular Dataframe Should have {expected_num_cols}" + f"Columns but has {ts_matrix.shape[1]}!" + ) + split = f.parts[-5] + shard_num = f.parts[-4] + med_shard_fp = (Path(cfg.input_dir) / split / shard_num).with_suffix(".parquet") + expected_num_rows = ( + get_unique_time_events_df(get_events_df(pl.scan_parquet(med_shard_fp), feature_columns)) + .collect() + .shape[0] + ) + assert ts_matrix.shape[0] == expected_num_rows, ( + f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" 
+ ) + output_files = list_subdir_files(str(output_dir.resolve()), "npz") + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" + assert expected_fp in output_files, f"Missing {expected_fp}" + expected_num_time_tabs = ( + NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) + ) + expected_num_static_tabs = NUM_SHARDS * 2 + assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + cfg.output_dir + # Step 3: Cache Task data + cache_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in cache_config.items()] + cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + + # Create fake labels + df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() + pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) + df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) + df = df.select("patient_id", pl.col("time").alias("prediction_time"), "boolean_value") + + out_fp = Path(cfg.input_label_dir) / "0.parquet" + out_fp.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(out_fp) + + cache_task.main(cfg) + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_dir).resolve()), "npz") + assert expected_fp in output_files, f"Missing {expected_fp}" + [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] + assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + + xgboost_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } - # Create fake labels - df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() - pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) - df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) - df = df.select("patient_id", pl.col("time").alias("prediction_time"), "boolean_value") + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose( + config_name="launch_model", overrides=overrides, return_hydra_config=True + ) # config.yaml - out_fp = Path(cfg.input_label_dir) / 
"0.parquet" - out_fp.parent.mkdir(parents=True, exist_ok=True) - df.write_parquet(out_fp) + output_dir = Path(cfg.output_cohort_dir) / "model" - cache_task.main(cfg) + HydraConfig().set_config(cfg) + launch_model.main(cfg) + output_files = list(output_dir.glob("**/*.json")) + assert len(output_files) == 1 - xgboost_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } + sklearnmodel_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - output_dir = Path(cfg.output_cohort_dir) / "model" + output_dir = Path(cfg.output_cohort_dir) / "model" - HydraConfig().set_config(cfg) - launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.json")) - assert len(output_files) == 1 + launch_model.main(cfg) + output_files = list(output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 - sklearnmodel_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } + sklearnmodel_config_kwargs = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "model_params.iterator.keep_data_in_memory": False, + "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + + output_dir = Path(cfg.output_cohort_dir) / "model_online" + + launch_model.main(cfg) + output_files = list(output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 - output_dir = Path(cfg.output_cohort_dir) / "model" + if importlib.util.find_spec("autogluon") is not None: + import autogluon as ag - launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) - assert len(output_files) == 1 + from MEDS_tabular_automl.scripts import launch_autogluon - sklearnmodel_config_kwargs = { + autogluon_config_kwargs = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", @@ -363,40 +410,15 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = 
compose(config_name="launch_model", overrides=overrides) # config.yaml + overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] + cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model_online" - launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) - assert len(output_files) == 1 - - if importlib.util.find_spec("autogluon") is not None: - import autogluon as ag - - from MEDS_tabular_automl.scripts import launch_autogluon - - autogluon_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - } - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] - cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model_online" - - launch_autogluon.main(cfg) - output_files = list(output_dir.glob("*")) - most_recent_file = max(output_files, key=os.path.getmtime) - ag.tabular.TabularPredictor.load(most_recent_file) + launch_autogluon.main(cfg) + output_files = list(output_dir.glob("*")) + most_recent_file = max(output_files, key=os.path.getmtime) + ag.tabular.TabularPredictor.load(most_recent_file) def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): From 299bf6f22483b529d1142c476d9dd12f2c5c17da Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 25 Aug 2024 17:12:46 +0000 Subject: [PATCH 19/54] setup dynamic versioning --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c6d9c6c..9452f09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "meds-tab" -version = "0.0.4" +dynamic = ["version"] authors = [ { name="Matthew McDermott", email="mattmcdermott8@gmail.com" }, { name="Nassim Oufattole", email="noufattole@gmail.com" }, From 8a7692a8d19ad5d448d3925a71535b759f15f78f Mon Sep 17 00:00:00 2001 From: teyaberg Date: Thu, 5 Sep 2024 23:12:56 +0000 Subject: [PATCH 20/54] version updates --- README.md | 8 ++--- docs/source/implementation.md | 2 +- docs/source/overview.md | 6 ++-- pyproject.toml | 2 +- .../configs/task_specific_caching.yaml | 2 +- src/MEDS_tabular_automl/describe_codes.py | 6 ++-- .../generate_static_features.py | 32 +++++++++---------- .../generate_summarized_reps.py | 10 +++--- .../generate_ts_features.py | 2 +- src/MEDS_tabular_automl/scripts/cache_task.py | 8 ++--- src/MEDS_tabular_automl/utils.py | 8 ++--- tests/test_integration.py | 4 +-- tests/test_tabularize.py | 12 +++---- 13 files changed, 51 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 838081a..8900e41 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,7 @@ By following these steps, you can seamlessly transform your dataset, define nece This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. -2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. 
Each row corresponds to a unique `patient_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. +2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` @@ -119,7 +119,7 @@ By following these steps, you can seamlessly transform your dataset, define nece - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24) -3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). +3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). **Example: Aggregate time-series data** on features across different `window_sizes` @@ -134,7 +134,7 @@ By following these steps, you can seamlessly transform your dataset, define nece tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) @@ -321,7 +321,7 @@ Now that we have generated tabular features for all the events in our dataset, w - **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process. 
- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times. -The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns. +The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns. ## 4. XGBoost Training diff --git a/docs/source/implementation.md b/docs/source/implementation.md index aabe6a2..e93186a 100644 --- a/docs/source/implementation.md +++ b/docs/source/implementation.md @@ -92,7 +92,7 @@ Now that we have generated tabular features for all the events in our dataset, w - **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process. - **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times. -The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard filestructure as the input meds data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns. +The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard filestructure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns. ## 4. XGBoost Training diff --git a/docs/source/overview.md b/docs/source/overview.md index af596e6..44f68bf 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -40,7 +40,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. -2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. +2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` @@ -54,7 +54,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24) -3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). +3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). **Example: Aggregate time-series data** on features across different `window_sizes` @@ -69,7 +69,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. 
**Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) diff --git a/pyproject.toml b/pyproject.toml index 9452f09..d24137f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ ] dependencies = [ "polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", - "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3", + "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.2", "meds-transforms==0.0.7", ] [project.scripts] diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 7a3ee79..63fed0f 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -6,7 +6,7 @@ task_name: task # Tabularized Data input_dir: ${output_cohort_dir}/tabularize -# Where the labels are stored, with columns patient_id, timestamp, label +# Where the labels are stored, with columns subject_id, timestamp, label input_label_dir: ${MEDS_cohort_dir}/tasks/${task_name}/ # Where to output the task specific tabularized data output_dir: ${output_cohort_dir}/${task_name}/task_cache diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 5a86d0e..70c53bd 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -80,7 +80,7 @@ def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: Examples: >>> from datetime import datetime >>> data = pl.DataFrame({ - ... 'patient_id': [1, 1, 2, 2, 3, 3, 3], + ... 'subject_id': [1, 1, 2, 2, 3, 3, 3], ... 'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C'], ... 'time': [ ... None, @@ -101,7 +101,7 @@ def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: ... ) """ static_df = shard_df.filter( - pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("time").is_null() + pl.col("subject_id").is_not_null() & pl.col("code").is_not_null() & pl.col("time").is_null() ) static_code_freqs_df = static_df.group_by("code").agg(pl.count("code").alias("count")).collect() static_code_freqs = { @@ -117,7 +117,7 @@ def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: } ts_df = shard_df.filter( - pl.col("patient_id").is_not_null() & pl.col("code").is_not_null() & pl.col("time").is_not_null() + pl.col("subject_id").is_not_null() & pl.col("code").is_not_null() & pl.col("time").is_not_null() ) code_freqs_df = ts_df.group_by("code").agg(pl.count("code").alias("count")).collect() code_freqs = {row["code"] + "/code": row["count"] for row in code_freqs_df.iter_rows(named=True)} diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index c990ece..9f99b23 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -36,7 +36,7 @@ def convert_to_matrix(df: pl.DataFrame, num_events: int, num_features: int) -> c Returns: A sparse matrix representation of the DataFrame. 
""" - dense_matrix = df.drop("patient_id").collect().to_numpy() + dense_matrix = df.drop("subject_id").collect().to_numpy() data_list = [] rows = [] cols = [] @@ -54,7 +54,7 @@ def convert_to_matrix(df: pl.DataFrame, num_events: int, num_features: int) -> c def get_sparse_static_rep( static_features: list[str], static_df: pl.DataFrame, meds_df: pl.DataFrame, feature_columns: list[str] ) -> coo_array: - """Merges static and time-series dataframes into a sparse representation based on the patient_id column. + """Merges static and time-series dataframes into a sparse representation based on the subject_id column. Args: static_features: A list of static feature names. @@ -68,10 +68,10 @@ def get_sparse_static_rep( # Make static data sparse and merge it with the time-series data logger.info("Make static data sparse and merge it with the time-series data") # Check static_df is sorted and unique - assert static_df.select(pl.col("patient_id")).collect().to_series().is_sorted() + assert static_df.select(pl.col("subject_id")).collect().to_series().is_sorted() assert ( static_df.select(pl.len()).collect().item() - == static_df.select(pl.col("patient_id").n_unique()).collect().item() + == static_df.select(pl.col("subject_id").n_unique()).collect().item() ) meds_df = get_unique_time_events_df(get_events_df(meds_df, feature_columns)) @@ -81,9 +81,9 @@ def get_sparse_static_rep( ) # Duplicate static matrix rows to match time-series data events_per_patient = ( - meds_df.select(pl.col("patient_id").value_counts()) - .unnest("patient_id") - .sort(by="patient_id") + meds_df.select(pl.col("subject_id").value_counts()) + .unnest("subject_id") + .sort(by="subject_id") .select(pl.col("count")) .collect() .to_series() @@ -110,16 +110,16 @@ def summarize_static_measurements( df: The DataFrame from which features will be extracted and summarized. Returns: - A LazyFrame containing summarized data pivoted by 'patient_id' for each static feature. + A LazyFrame containing summarized data pivoted by 'subject_id' for each static feature. 
""" if agg == STATIC_VALUE_AGGREGATION: static_features = get_feature_names(agg=agg, feature_columns=feature_columns) # Handling 'first' static values static_first_codes = [parse_static_feature_column(c)[0] for c in static_features] code_subset = df.filter(pl.col("code").is_in(static_first_codes)) - first_code_subset = code_subset.group_by(pl.col("patient_id")).first().collect() + first_code_subset = code_subset.group_by(pl.col("subject_id")).first().collect() static_value_pivot_df = first_code_subset.pivot( - index=["patient_id"], columns=["code"], values=["numeric_value"], aggregate_function=None + index=["subject_id"], columns=["code"], values=["numeric_value"], aggregate_function=None ) # rename code to feature name remap_cols = { @@ -128,8 +128,8 @@ def summarize_static_measurements( if input_name in static_value_pivot_df.columns } static_value_pivot_df = static_value_pivot_df.select( - *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] - ).sort(by="patient_id") + *["subject_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + ).sort(by="subject_id") # pivot can be faster: https://stackoverflow.com/questions/73522017/replacing-a-pivot-with-a-lazy-groupby-operation # noqa: E501 # TODO: consider casting with .cast(pl.Float32)) return static_value_pivot_df @@ -138,17 +138,17 @@ def summarize_static_measurements( # Handling 'present' static indicators static_present_codes = [parse_static_feature_column(c)[0] for c in static_features] static_present_pivot_df = ( - df.select(*["patient_id", "code"]) + df.select(*["subject_id", "code"]) .filter(pl.col("code").is_in(static_present_codes)) .with_columns(pl.lit(True).alias("__indicator")) .collect() .pivot( - index=["patient_id"], + index=["subject_id"], columns=["code"], values="__indicator", aggregate_function=None, ) - .sort(by="patient_id") + .sort(by="subject_id") ) remap_cols = { input_name: output_name @@ -157,7 +157,7 @@ def summarize_static_measurements( } # rename columns to final feature names static_present_pivot_df = static_present_pivot_df.select( - *["patient_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] + *["subject_id"], *[pl.col(k).alias(v).cast(pl.Boolean) for k, v in remap_cols.items()] ) return static_present_pivot_df else: diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index c2dcb86..1cd7405 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -59,7 +59,7 @@ def get_rolling_window_indicies(index_df: pl.LazyFrame, window_size: str) -> pl. timedelta = pd.Timedelta(window_size) return ( index_df.with_row_index("index") - .rolling(index_column="time", period=timedelta, group_by="patient_id") + .rolling(index_column="time", period=timedelta, group_by="subject_id") .agg([pl.col("index").min().alias("min_index"), pl.col("index").max().alias("max_index")]) .select(pl.col("min_index", "max_index")) .collect() @@ -132,12 +132,12 @@ def compute_agg( ) -> csr_array: """Applies aggregation to a sparse matrix using rolling window indices derived from a DataFrame. - Dataframe is expected to only have the relevant columns for aggregating. It should have the patient_id and + Dataframe is expected to only have the relevant columns for aggregating. It should have the subject_id and time columns, and then only code columns if agg is a code aggregation or only value columns if it is a value aggreagation. 
Args: - index_df: The DataFrame with 'patient_id' and 'time' columns used for grouping. + index_df: The DataFrame with 'subject_id' and 'time' columns used for grouping. matrix: The sparse matrix to be aggregated. window_size: The string defining the rolling window size. agg: The string specifying the aggregation method. @@ -149,11 +149,11 @@ def compute_agg( """ group_df = ( index_df.with_row_index("index") - .group_by(["patient_id", "time"], maintain_order=True) + .group_by(["subject_id", "time"], maintain_order=True) .agg([pl.col("index").min().alias("min_index"), pl.col("index").max().alias("max_index")]) .collect() ) - index_df = group_df.lazy().select(pl.col("patient_id", "time")) + index_df = group_df.lazy().select(pl.col("subject_id", "time")) windows = group_df.select(pl.col("min_index", "max_index")) logger.info("Step 1.5: Running sparse aggregation.") matrix = aggregate_matrix(windows, matrix, agg, num_features, use_tqdm) diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index 331f65e..dc6ee52 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -107,7 +107,7 @@ def summarize_dynamic_measurements( of aggregated values. """ logger.info("Generating Sparse matrix for Time Series Features") - id_cols = ["patient_id", "time"] + id_cols = ["subject_id", "time"] # Confirm dataframe is sorted check_df = df.select(pl.col(id_cols)) diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 8d36974..a5a34b0 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -98,7 +98,7 @@ def main(cfg: DictConfig): cfg.label_column: "label", } ) - .group_by(pl.col("patient_id", "time"), maintain_order=True) + .group_by(pl.col("subject_id", "time"), maintain_order=True) .first() ) @@ -124,11 +124,11 @@ def extract_labels(meds_data_df): meds_data_df = ( get_unique_time_events_df(get_events_df(meds_data_df, feature_columns)) .with_row_index("event_id") - .select("patient_id", "time", "event_id") + .select("subject_id", "time", "event_id") ) shard_label_df = label_df.join( - meds_data_df.select("patient_id").unique(), on="patient_id", how="inner" - ).join_asof(other=meds_data_df, by="patient_id", on="time") + meds_data_df.select("subject_id").unique(), on="subject_id", how="inner" + ).join_asof(other=meds_data_df, by="subject_id", on="time") return shard_label_df def read_fn(in_fp_tuple): diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 4971ed7..57d1bd2 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -315,20 +315,20 @@ def get_events_df(shard_df: pl.LazyFrame, feature_columns) -> pl.LazyFrame: def get_unique_time_events_df(events_df: pl.LazyFrame) -> pl.LazyFrame: - """Ensures all times in the events LazyFrame are unique and sorted by patient_id and time. + """Ensures all times in the events LazyFrame are unique and sorted by subject_id and time. Args: events_df: Events LazyFrame to process. Returns: - A LazyFrame with unique times, sorted by patient_id and time. + A LazyFrame with unique times, sorted by subject_id and time. 
""" assert events_df.select(pl.col("time")).null_count().collect().item() == 0 # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline events_df = ( - events_df.drop_nulls("time").select(pl.col(["patient_id", "time"])).unique(maintain_order=True) + events_df.drop_nulls("time").select(pl.col(["subject_id", "time"])).unique(maintain_order=True) ) - assert events_df.sort(by=["patient_id", "time"]).collect().equals(events_df.collect()) + assert events_df.sort(by=["subject_id", "time"]).collect().equals(events_df.collect()) return events_df diff --git a/tests/test_integration.py b/tests/test_integration.py index 12948ee..907b038 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -76,7 +76,7 @@ def test_integration(tmp_path): df.write_parquet(file_path) all_data.append(df) - all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) + all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") @@ -209,7 +209,7 @@ def test_integration(tmp_path): df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) - df = df.select("patient_id", pl.col("time").alias("prediction_time"), "boolean_value") + df = df.select("subject_id", pl.col("time").alias("prediction_time"), "boolean_value") out_fp = Path(cfg.input_label_dir) / "0.parquet" out_fp.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 7691440..9865a9d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -38,7 +38,7 @@ NUM_SHARDS = 4 MEDS_TRAIN_0 = """ -patient_id,code,time,numeric_value +subject_id,code,time,numeric_value 239684,HEIGHT,,175.271115221764 239684,EYE_COLOR//BROWN,, 239684,DOB,1980-12-28T00:00:00.000000, @@ -71,7 +71,7 @@ 1195293,DISCHARGE,2010-06-20T20:50:04.000000, """ MEDS_TRAIN_1 = """ -patient_id,code,time,numeric_value +subject_id,code,time,numeric_value 68729,EYE_COLOR//HAZEL,, 68729,HEIGHT,,160.3953106166676 68729,DOB,1978-03-09T00:00:00.000000, @@ -88,7 +88,7 @@ 814703,DISCHARGE,2010-02-05T07:02:30.000000, """ MEDS_HELD_OUT_0 = """ -patient_id,code,time,numeric_value +subject_id,code,time,numeric_value 1500733,HEIGHT,,158.60131573580904 1500733,EYE_COLOR//BROWN,, 1500733,DOB,1986-07-20T00:00:00.000000, @@ -102,7 +102,7 @@ 1500733,DISCHARGE,2010-06-03T16:44:26.000000, """ MEDS_TUNING_0 = """ -patient_id,code,time,numeric_value +subject_id,code,time,numeric_value 754281,EYE_COLOR//BROWN,, 754281,HEIGHT,,166.22261567137025 754281,DOB,1988-12-19T00:00:00.000000, @@ -183,7 +183,7 @@ def test_tabularize(tmp_path): df.write_parquet(file_path) all_data.append(df) - all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["patient_id", "time"]) + all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") @@ -314,7 +314,7 @@ def test_tabularize(tmp_path): df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) - df = df.select("patient_id", pl.col("time").alias("prediction_time"), 
"boolean_value") + df = df.select("subject_id", pl.col("time").alias("prediction_time"), "boolean_value") out_fp = Path(cfg.input_label_dir) / "0.parquet" out_fp.parent.mkdir(parents=True, exist_ok=True) From 158b8faad09988d4c93f82a99c54024340d51c65 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 6 Sep 2024 13:59:09 +0000 Subject: [PATCH 21/54] version updates --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d24137f..0fd79de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ ] dependencies = [ "polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", - "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.2", "meds-transforms==0.0.7", + "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] [project.scripts] From e92049f1dfd79e0c6e290f8d89bd1e799a9e0c0f Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 6 Sep 2024 15:37:22 +0000 Subject: [PATCH 22/54] fix hydra-core version for experimental callback support --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0fd79de..59b30cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", + "polars", "pyarrow", "loguru", "hydra-core==1.3.2", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] From 0623aaabc8dd953b5d74f12f75301d2cca2d36ca Mon Sep 17 00:00:00 2001 From: teyaberg Date: Fri, 6 Sep 2024 18:59:11 +0000 Subject: [PATCH 23/54] eval callback logging --- .../configs/launch_autogluon.yaml | 3 - .../configs/launch_model.yaml | 13 ++-- .../configs/model/knn_classifier.yaml | 5 +- .../configs/model/logistic_regression.yaml | 5 +- .../model/random_forest_classifier.yaml | 5 +- .../configs/model/sgd_classifier.yaml | 5 +- .../configs/model/xgboost.yaml | 5 +- .../evaluation_callback.py | 65 ++++++++++--------- .../generate_static_features.py | 16 +++-- .../generate_summarized_reps.py | 7 +- .../generate_ts_features.py | 10 ++- src/MEDS_tabular_automl/scripts/cache_task.py | 3 +- .../scripts/launch_model.py | 38 ++--------- .../scripts/tabularize_time_series.py | 5 +- src/MEDS_tabular_automl/sklearn_model.py | 43 ++++-------- src/MEDS_tabular_automl/utils.py | 36 +++++++++- src/MEDS_tabular_automl/xgboost_model.py | 15 +---- tests/test_tabularize.py | 6 +- 18 files changed, 143 insertions(+), 142 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml index 567d11c..908e79d 100644 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml @@ -3,9 +3,6 @@ defaults: - tabularization: default - imputer: default - normalization: default - - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe - - override hydra/launcher: joblib - _self_ task_name: task diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index ad47261..9938cd9 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ 
b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -17,10 +17,15 @@ input_dir: ${output_cohort_dir}/${task_name}/task_cache # Directory with task labels input_label_dir: ${output_cohort_dir}/${task_name}/labels/ # Where to output the model and cached data -model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} -output_filepath: ${model_dir}/model_metadata.json -model_log_dir: ${model_dir}/.logs/ -save_top_k: -1 +model_saving: + model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} + model_file_stem: model + model_file_extension: .json + delete_below_top_k: -1 +model_logging: + model_log_dir: ${model_saving.model_dir}/.logs/ + performance_log_stem: performance + config_log_stem: config name: launch_model diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml index 9e86e81..1ca034a 100644 --- a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml @@ -5,8 +5,9 @@ model_target: model_params: ${model_params} input_dir: ${input_dir} input_label_dir: ${input_label_dir} - model_dir: ${model_dir} - output_filepath: ${output_filepath} + model_dir: ${model_saving.model_dir} + model_file_stem: ${model_saving.model_file_stem} + model_file_extension: ${model_saving.model_file_extension} log_dir: ${log_dir} cache_dir: ${cache_dir} imputer: ${model_params.iterator.imputer} diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml index bdfc19a..0f74a7b 100644 --- a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml +++ b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml @@ -5,8 +5,9 @@ model_target: model_params: ${model_params} input_dir: ${input_dir} input_label_dir: ${input_label_dir} - model_dir: ${model_dir} - output_filepath: ${output_filepath} + model_dir: ${model_saving.model_dir} + model_file_stem: ${model_saving.model_file_stem} + model_file_extension: ${model_saving.model_file_extension} log_dir: ${log_dir} cache_dir: ${cache_dir} imputer: ${model_params.iterator.imputer} diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml index c36c506..58a9671 100644 --- a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml @@ -5,8 +5,9 @@ model_target: model_params: ${model_params} input_dir: ${input_dir} input_label_dir: ${input_label_dir} - model_dir: ${model_dir} - output_filepath: ${output_filepath} + model_dir: ${model_saving.model_dir} + model_file_stem: ${model_saving.model_file_stem} + model_file_extension: ${model_saving.model_file_extension} log_dir: ${log_dir} cache_dir: ${cache_dir} imputer: ${model_params.iterator.imputer} diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml index 8411817..2f2b57f 100644 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml @@ -5,8 +5,9 @@ model_target: model_params: ${model_params} input_dir: ${input_dir} input_label_dir: ${input_label_dir} - model_dir: ${model_dir} - output_filepath: ${output_filepath} + model_dir: ${model_saving.model_dir} + model_file_stem: ${model_saving.model_file_stem} + model_file_extension: 
${model_saving.model_file_extension} log_dir: ${log_dir} cache_dir: ${cache_dir} imputer: ${model_params.iterator.imputer} diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml index 11c8d81..793cc29 100644 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model/xgboost.yaml @@ -5,8 +5,9 @@ model_target: model_params: ${model_params} input_dir: ${input_dir} input_label_dir: ${input_label_dir} - model_dir: ${model_dir} - output_filepath: ${output_filepath} + model_dir: ${model_saving.model_dir} + model_file_stem: ${model_saving.model_file_stem} + model_file_extension: ${model_saving.model_file_extension} log_dir: ${log_dir} cache_dir: ${cache_dir} imputer: ${model_params.iterator.imputer} diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index 84ce5bb..fddb858 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -1,8 +1,9 @@ import ast from pathlib import Path -import pandas as pd +import polars as pl from hydra.experimental.callback import Callback +from loguru import logger from omegaconf import DictConfig, OmegaConf @@ -11,47 +12,53 @@ def __init__(self, **kwargs): self.kwargs = kwargs def on_multirun_end(self, config: DictConfig, **kwargs): - """Find best model based on log files and print its performance and hyperparameters.""" - log_fp = Path(config.model_log_dir) + """Find best model based on log files and logger.info its performance and hyperparameters.""" + log_fp = Path(config.model_logging.model_log_dir) - performance = pd.read_csv( - log_fp / "performance.log", sep=",", header=None - ) # , columns=["model_fp", "tuning_auc", "test_auc"]) - performance.columns = ["model_fp", "tuning_auc", "test_auc"] - performance.sort_values("tuning_auc", ascending=False, inplace=True) - print(performance.head()) + try: + performance = pl.read_csv(log_fp / "*/*.csv") + except Exception as e: + raise FileNotFoundError(f"Log files incomplete or not found at {log_fp}, exception {e}.") - hyperparams = pd.read_csv(log_fp / "hyperparameters.log", sep="\t", header=None) - hyperparams.columns = ["model_fp", "tabularization", "model_params"] + performance = performance.sort("tuning_auc", descending=True, nulls_last=True) + logger.info(performance.head(10)) - best_model = performance.head(1)["model_fp"].values[0] - best_hyperparams = hyperparams[hyperparams["model_fp"] == best_model] + # get best model_fp + best_model = performance[0, 0] - print(f"The best model can be found at {best_model}") - self.print_performance(performance.head(1)) - self.print_hyperparams(best_hyperparams) - if hasattr(config, "save_top_k") and config.save_top_k >= 0: - self.save_top_k_models(performance, config.save_top_k, config.model_dir) + best_params_fp = log_fp / best_model / f"{config.model_logging.config_log_stem}.json" + + # check if this file exists + if not best_params_fp.is_file(): + raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}") + + logger.info(f"The best model can be found at {best_model}") + # self.log_performance(performance.head(1)) + # self.log_hyperparams(best_hyperparams) + if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: + self.delete_below_top_k_models( + performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir + ) return performance.head(1) - def print_performance(self, 
performance): - """Print performance of the best model with nice formatting.""" - print("Performance of the best model:") - print(f"Tuning AUC: {performance['tuning_auc'].values[0]}") - print(f"Test AUC: {performance['test_auc'].values[0]}") + def log_performance(self, performance): + """logger.info performance of the best model with nice formatting.""" + logger.info("Performance of the best model:") + logger.info(f"Tuning AUC: {performance['tuning_auc'].values[0]}") + logger.info(f"Test AUC: {performance['test_auc'].values[0]}") - def print_hyperparams(self, hyperparams): - """Print hyperparameters of the best model with nice formatting.""" - print("Hyperparameters of the best model:") - print( + def log_hyperparams(self, hyperparams): + """logger.info hyperparameters of the best model with nice formatting.""" + logger.info("Hyperparameters of the best model:") + logger.info( f"Tabularization: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['tabularization'].values[0]))}" ) - print( + logger.info( f"Model parameters: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['model_params'].values[0]))}" ) - def save_top_k_models(self, performance, k, model_dir): + def delete_below_top_k_models(self, performance, k, model_dir): """Save only top k models from the model directory and delete all other files.""" top_k_models = performance.head(k)["model_fp"].values for model_fp in Path(model_dir).iterdir(): diff --git a/src/MEDS_tabular_automl/generate_static_features.py b/src/MEDS_tabular_automl/generate_static_features.py index 9f99b23..77269b1 100644 --- a/src/MEDS_tabular_automl/generate_static_features.py +++ b/src/MEDS_tabular_automl/generate_static_features.py @@ -67,12 +67,15 @@ def get_sparse_static_rep( """ # Make static data sparse and merge it with the time-series data logger.info("Make static data sparse and merge it with the time-series data") - # Check static_df is sorted and unique - assert static_df.select(pl.col("subject_id")).collect().to_series().is_sorted() - assert ( + # Check static_df is sorted and unique and raise error if it is not + if not static_df.select(pl.col("subject_id")).collect().to_series().is_sorted(): + raise ValueError("static_df is not sorted by subject_id.") + if not ( static_df.select(pl.len()).collect().item() == static_df.select(pl.col("subject_id").n_unique()).collect().item() - ) + ): + raise ValueError("static_df has duplicate subject_id values.") + meds_df = get_unique_time_events_df(get_events_df(meds_df, feature_columns)) # load static data as sparse matrix @@ -189,7 +192,6 @@ def get_flat_static_rep( raise ValueError(f"No static features found. Remove the aggregation function {agg}") # convert to sparse_matrix matrix = get_sparse_static_rep(static_features, static_measurements.lazy(), shard_df, feature_columns) - assert matrix.shape[1] == len( - static_features - ), f"Expected {len(static_features)} features, got {matrix.shape[1]}" + if not matrix.shape[1] == len(static_features): + raise ValueError(f"Expected {len(static_features)} features, got {matrix.shape[1]}") return matrix diff --git a/src/MEDS_tabular_automl/generate_summarized_reps.py b/src/MEDS_tabular_automl/generate_summarized_reps.py index 1cd7405..6721306 100644 --- a/src/MEDS_tabular_automl/generate_summarized_reps.py +++ b/src/MEDS_tabular_automl/generate_summarized_reps.py @@ -193,13 +193,16 @@ def generate_summary( raise ValueError( f"Invalid aggregation: {agg}. 
Valid options are: {CODE_AGGREGATIONS + VALUE_AGGREGATIONS}" ) - assert len(feature_columns), "feature_columns must be a non-empty list" + if not len(feature_columns): + raise ValueError("No feature columns provided -- feature_columns must be a non-empty list.") ts_columns = get_feature_names(agg, feature_columns) # Generate summaries for each window size and aggregation code_type, _ = agg.split("/") # only iterate through code_types that exist in the dataframe columns - assert any([c.endswith(code_type) for c in ts_columns]) + if not any([c.endswith(code_type) for c in ts_columns]): + raise ValueError(f"No columns found for aggregation {agg} in feature_columns: {ts_columns}.") + logger.info( f"Generating aggregation {agg} for window_size {window_size}, with {len(ts_columns)} columns." ) diff --git a/src/MEDS_tabular_automl/generate_ts_features.py b/src/MEDS_tabular_automl/generate_ts_features.py index dc6ee52..43fdd5f 100644 --- a/src/MEDS_tabular_automl/generate_ts_features.py +++ b/src/MEDS_tabular_automl/generate_ts_features.py @@ -57,7 +57,8 @@ def get_long_code_df( .to_series() .to_numpy() ) - assert np.issubdtype(cols.dtype, np.number), "numeric_value must be a numerical type" + if not np.issubdtype(cols.dtype, np.number): + raise ValueError("numeric_value must be a numerical type. Instead it has type: ", cols.dtype) data = np.ones(df.select(pl.len()).collect().item(), dtype=np.bool_) return data, (rows, cols) @@ -85,7 +86,9 @@ def get_long_value_df( .to_series() .to_numpy() ) - assert np.issubdtype(cols.dtype, np.number), "numeric_value must be a numerical type" + if not np.issubdtype(cols.dtype, np.number): + raise ValueError("numeric_value must be a numerical type. Instead it has type: ", cols.dtype) + data = value_df.select(pl.col("numeric_value")).collect().to_series().to_numpy() return data, (rows, cols) @@ -111,7 +114,8 @@ def summarize_dynamic_measurements( # Confirm dataframe is sorted check_df = df.select(pl.col(id_cols)) - assert check_df.sort(by=id_cols).collect().equals(check_df.collect()), "data frame must be sorted" + if not check_df.sort(by=id_cols).collect().equals(check_df.collect()): + raise ValueError("data frame must be sorted by subject_id and time") # Generate sparse matrix if agg in CODE_AGGREGATIONS: diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index a5a34b0..0903aa3 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -116,7 +116,7 @@ def read_meds_data_df(meds_data_fp): if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: raise ValueError( f"'numeric_value' column not found in raw data {meds_data_fp}. 
" - "You are maybe loading labels instead or meds data" + "You are maybe loading labels instead of meds data" ) return filter_parquet(meds_data_fp, cfg.tabularization._resolved_codes) @@ -133,7 +133,6 @@ def extract_labels(meds_data_df): def read_fn(in_fp_tuple): meds_data_fp, data_fp = in_fp_tuple - assert "data" in str(meds_data_fp) # TODO: replace this with more intelligent locking if not Path(shard_label_fp).exists(): logger.info(f"Extracting labels for {shard_label_fp}") diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index b4fa3de..97943b4 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -6,41 +6,14 @@ from omegaconf import DictConfig, open_dict from MEDS_tabular_automl.base_model import BaseModel -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from ..utils import hydra_loguru_init +from ..utils import hydra_loguru_init, log_to_logfile config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") -def log_to_logfile(model, cfg, output_fp): - """Log model hyperparameters and performance to two log files.""" - log_fp = Path(cfg.model_log_dir) - log_fp.mkdir(parents=True, exist_ok=True) - # log hyperparameters - out_fp = log_fp / "trial_performance_results.log" - - def write_fn(_, out_fp): - with open(out_fp, "a") as f: - f.write( - f"{output_fp}\t{cfg.tabularization}\t{cfg.model_params}" - f"\t{model.evaluate()}\t{model.evaluate(split='held_out')}\n" - ) - - rwlock_wrap( - None, - out_fp, - lambda _: None, # read_fn is ignored - write_fn, - cache_intermediate=True, - clear_cache_on_completion=True, - do_overwrite=True, - do_return=False, - ) - - @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Optimizes the model based on the provided configuration. 
@@ -66,12 +39,15 @@ def main(cfg: DictConfig) -> float: # logger.info(f"AUC: {auc}") # save model - output_fp = Path(cfg.output_filepath) - output_fp = output_fp.parent / f"{output_fp.stem}_{auc:.4f}_{time.time()}{output_fp.suffix}" + output_fp = Path(cfg.model_saving.model_dir) + output_fp = ( + output_fp.parent + / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_saving.model_file_extension}" + ) output_fp.parent.mkdir(parents=True, exist_ok=True) # log to logfile - log_to_logfile(model, cfg, output_fp) + log_to_logfile(model, cfg, output_fp.stem) model.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index c6ecc98..8eca7f4 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -101,7 +101,10 @@ def compute_fn(shard_df): window_size, agg, ) - assert summary_df.shape[1] > 0, "No data found in the summarized dataframe" + + if not summary_df.shape[1]: + raise ValueError("No data found in the summarized dataframe.") + del index_df del sparse_matrix gc.collect() diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 9b94fd0..56063fd 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -3,7 +3,6 @@ import numpy as np import scipy.sparse as sp from loguru import logger -from mixins import TimeableMixin from omegaconf import DictConfig from sklearn.metrics import roc_auc_score @@ -11,7 +10,7 @@ from .tabular_dataset import TabularDataset -class SklearnIterator(TabularDataset, TimeableMixin): +class SklearnIterator(TabularDataset): """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models. This class provides functionality for iterating through data shards, loading @@ -43,21 +42,14 @@ def __init__(self, cfg: DictConfig, split: str): cfg: The configuration dictionary. split: The data split to use. """ - TabularDataset.__init__(self, cfg=cfg, split=split) - TimeableMixin.__init__(self) + super().__init__(cfg=cfg, split=split) self.valid_event_ids, self.labels = self._load_ids_and_labels() # check if the labels are empty if len(self.labels) == 0: raise ValueError("No labels found.") - # self._it = 0 - # def iterate(self, function): - # for shard_idx in range(len(self._data_shards)): - # data, labels = self.get_data_shards(shard_idx) - # function(data, labels) - -class SklearnMatrix(TimeableMixin): +class SklearnMatrix: """SklearnMatrix class for loading and processing data shards for use in SciKit-Learn models.""" def __init__(self, data: sp.csr_matrix, labels: np.ndarray): @@ -77,24 +69,24 @@ def get_label(self): return self.labels -class SklearnModel(BaseModel, TimeableMixin): +class SklearnModel(BaseModel): """Class for configuring, training, and evaluating an SciKit-Learn model. This class utilizes the configuration settings provided to manage the training and evaluation - process of an XGBoost model, ensuring the model is trained and validated using specified parameters + process of an SKlearn model, ensuring the model is trained and validated using specified parameters and data splits. It supports training with in-memory data handling as well as direct streaming from disk using iterators. 
Args: - cfg: The configuration settings for the model, including data paths, model parameters,ß + cfg: The configuration settings for the model, including data paths, model parameters, and flags for data handling. Attributes: cfg: Configuration object containing all settings required for model operation. - model: The XGBoost model after being trained. - dtrain: The training dataset in DMatrix format. - dtuning: The tuning (validation) dataset in DMatrix format. - dheld_out: The held-out (test) dataset in DMatrix format. + model: The SKlearn model. + dtrain: The training dataset in Matrix format. + dtuning: The tuning (validation) dataset in Matrix format. + dheld_out: The held-out (test) dataset in Matrix format. itrain: Iterator for the training dataset. ituning: Iterator for the tuning dataset. iheld_out: Iterator for the held-out dataset. @@ -102,7 +94,7 @@ class SklearnModel(BaseModel, TimeableMixin): """ def __init__(self, cfg: DictConfig): - """Initializes the XGBoostClassifier with the provided configuration. + """Initializes the SklearnClassifier with the provided configuration. Args: cfg: The configuration dictionary. @@ -124,7 +116,6 @@ def __init__(self, cfg: DictConfig): if not hasattr(self.model, "fit"): raise ValueError("Model does not have a fit method.") - @TimeableMixin.TimeAs def _build_data(self): """Builds necessary data structures for training.""" if self.keep_data_in_memory: @@ -146,11 +137,6 @@ def _fit_from_partial(self): # train on each all data for shard_idx in range(len(self.itrain._data_shards)): data, labels = self.itrain.get_data_shards(shard_idx) - # if self.model.shuffle: # TODO: check this for speed - # # shuffle data - # indices = np.random.permutation(len(labels)) - # data = data[indices] - # labels = labels[indices] self.model.partial_fit(data, labels, classes=classes) # evaluate on tuning set auc = self.evaluate() @@ -161,36 +147,30 @@ def _fit_from_partial(self): if epoch - best_epoch > self.cfg.model_params.early_stopping_rounds: break - @TimeableMixin.TimeAs def _train(self): """Trains the model.""" - # two cases: data is in memory or data is streamed if self.keep_data_in_memory: self.model.fit(self.dtrain.get_data(), self.dtrain.get_label()) else: self._fit_from_partial() - @TimeableMixin.TimeAs def train(self): """Trains the model.""" self._build_data() self._train() - @TimeableMixin.TimeAs def _build_matrix_in_memory(self): """Builds the DMatrix from the data in memory.""" self.dtrain = SklearnMatrix(*self.itrain.get_data()) self.dtuning = SklearnMatrix(*self.ituning.get_data()) self.dheld_out = SklearnMatrix(*self.iheld_out.get_data()) - @TimeableMixin.TimeAs def _build_iterators(self): """Builds the iterators for training, validation, and testing.""" self.itrain = SklearnIterator(self.cfg, split="train") self.ituning = SklearnIterator(self.cfg, split="tuning") self.iheld_out = SklearnIterator(self.cfg, split="held_out") - @TimeableMixin.TimeAs def evaluate(self, split: str = "tuning") -> float: """Evaluates the model on the tuning set. 
@@ -213,6 +193,7 @@ def evaluate(self, split: str = "tuning") -> float: # check if model has predict_proba method if not hasattr(self.model, "predict_proba"): raise ValueError(f"Model {self.model.__class__.__name__} does not have a predict_proba method.") + # two cases: data is in memory or data is streamed if self.keep_data_in_memory: y_pred = self.model.predict_proba(dsplit.get_data())[:, 1] diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 57d1bd2..6b726f9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -157,7 +157,8 @@ def array_to_sparse_matrix(array: np.ndarray, shape: tuple[int, int]) -> coo_arr Raises: AssertionError: If the input array's first dimension is not 3. """ - assert array.shape[0] == 3 + if not array.shape[0] == 3: + raise AssertionError("Array must have 3 dimensions: [data, row, col], currently has", array.shape[0]) data, row, col = array return coo_array((data, (row, col)), shape=shape) @@ -323,12 +324,14 @@ def get_unique_time_events_df(events_df: pl.LazyFrame) -> pl.LazyFrame: Returns: A LazyFrame with unique times, sorted by subject_id and time. """ - assert events_df.select(pl.col("time")).null_count().collect().item() == 0 + if not events_df.select(pl.col("time")).null_count().collect().item() == 0: + raise ValueError("Time column must not have null values for time series data.") # Check events_df is sorted - so it aligns with the ts_matrix we generate later in the pipeline events_df = ( events_df.drop_nulls("time").select(pl.col(["subject_id", "time"])).unique(maintain_order=True) ) - assert events_df.sort(by=["subject_id", "time"]).collect().equals(events_df.collect()) + if not events_df.sort(by=["subject_id", "time"]).collect().equals(events_df.collect()): + raise ValueError("Data frame must be sorted by subject_id and time") return events_df @@ -392,3 +395,30 @@ def get_shard_prefix(base_path: Path, fp: Path) -> str: file_name = relative_path.name.split(".")[0] return str(relative_parent / file_name) + + +def log_to_logfile(model, cfg, output_fp): + """Log model hyperparameters and performance to two log files. + + Args: + model: The model to log. + cfg: The configuration dictionary. + output_fp: The relative output file path. 
+ """ + log_fp = Path(cfg.model_logging.model_log_dir) + + # make a folder to log everything for this model + out_fp = log_fp / output_fp + out_fp.mkdir(parents=True, exist_ok=True) + + # config as a json + config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.json" + with open(config_fp, "w") as f: + f.write(OmegaConf.to_yaml(cfg)) + + model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.csv" + with open(model_performance_fp, "w") as f: + f.write("model_fp,tuning_auc,test_auc\n") + f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") + + logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 424d9f0..ec33c15 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -4,7 +4,6 @@ import scipy.sparse as sp import xgboost as xgb from loguru import logger -from mixins import TimeableMixin from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score @@ -12,7 +11,7 @@ from .tabular_dataset import TabularDataset -class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin): +class XGBIterator(xgb.DataIter, TabularDataset): """XGBIterator class for loading and processing data shards for use in XGBoost models. This class provides functionality for iterating through data shards, loading @@ -46,14 +45,12 @@ def __init__(self, cfg: DictConfig, split: str): """ xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) TabularDataset.__init__(self, cfg=cfg, split=split) - TimeableMixin.__init__(self) self.valid_event_ids, self.labels = self._load_ids_and_labels() # check if the labels are empty if self.labels is None: raise ValueError("No labels found.") self._it = 0 - @TimeableMixin.TimeAs def next(self, input_data: Callable) -> int: """Advances the XGBIterator by one step and provides data to XGBoost for DMatrix construction. @@ -73,13 +70,12 @@ def next(self, input_data: Callable) -> int: return 1 - @TimeableMixin.TimeAs def reset(self): """Resets the XGBIterator to its beginning.""" self._it = 0 -class XGBoostModel(BaseModel, TimeableMixin): +class XGBoostModel(BaseModel): """Class for configuring, training, and evaluating an XGBoost model. 
This class utilizes the configuration settings provided to manage the training and evaluation
@@ -123,7 +119,6 @@ def __init__(self, cfg: DictConfig):

         self.model = None

-    @TimeableMixin.TimeAs
     def _build(self):
         """Builds necessary data structures for training."""
         if self.keep_data_in_memory:
@@ -133,7 +128,6 @@ def __init__(self, cfg: DictConfig):
             self._build_iterators()
             self._build_dmatrix_from_iterators()

-    @TimeableMixin.TimeAs
     def _train(self):
         """Trains the model."""
         self.model = xgb.train(
@@ -146,13 +140,11 @@ def __init__(self, cfg: DictConfig):
             verbose_eval=0,
         )

-    @TimeableMixin.TimeAs
     def train(self):
         """Trains the model."""
         self._build()
         self._train()

-    @TimeableMixin.TimeAs
     def _build_dmatrix_in_memory(self):
         """Builds the DMatrix from the data in memory."""
         X_train, y_train = self.itrain.get_data()
@@ -162,21 +154,18 @@ def __init__(self, cfg: DictConfig):
         self.dtuning = xgb.DMatrix(X_tuning, label=y_tuning)
         self.dheld_out = xgb.DMatrix(X_held_out, label=y_held_out)

-    @TimeableMixin.TimeAs
     def _build_dmatrix_from_iterators(self):
         """Builds the DMatrix from the iterators."""
         self.dtrain = xgb.DMatrix(self.itrain)
         self.dtuning = xgb.DMatrix(self.ituning)
         self.dheld_out = xgb.DMatrix(self.iheld_out)

-    @TimeableMixin.TimeAs
     def _build_iterators(self):
         """Builds the iterators for training, validation, and testing."""
         self.itrain = XGBIterator(self.cfg, split="train")
         self.ituning = XGBIterator(self.cfg, split="tuning")
         self.iheld_out = XGBIterator(self.cfg, split="held_out")

-    @TimeableMixin.TimeAs
     def evaluate(self, split="tuning") -> float:
         """Evaluates the model on the tuning set.

diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
index 9865a9d..b1bd879 100644
--- a/tests/test_tabularize.py
+++ b/tests/test_tabularize.py
@@ -354,7 +354,7 @@ def test_tabularize(tmp_path):
         HydraConfig().set_config(cfg)
         launch_model.main(cfg)
         output_files = list(output_dir.glob("**/*.json"))
-        assert len(output_files) == 1
+        assert len(output_files) == 2

         sklearnmodel_config_kwargs = {
             **shared_config,
@@ -379,7 +379,7 @@ def test_tabularize(tmp_path):
             "tabularization.min_code_inclusion_count": 1,
             "tabularization.window_sizes": "[30d,365d,full]",
             "model_params.iterator.keep_data_in_memory": False,
-            "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
+            "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
         }

         with initialize(
@@ -404,7 +404,7 @@ def test_tabularize(tmp_path):
             "tabularization.min_code_inclusion_count": 1,
             "tabularization.window_sizes": "[30d,365d,full]",
             "model_params.iterator.keep_data_in_memory": False,
-            "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
+            "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
         }

         with initialize(

From e1be8508bea0ce0db0e344d4b87f961ae3fc91ae Mon Sep 17 00:00:00 2001
From: Nassim Oufattole
Date: Sat, 7 Sep 2024 22:41:00 +0000
Subject: [PATCH 24/54] added script input args checks, reduced redundancy in
 model launcher configs through adding hierarchical defaults

---
 .github/workflows/publish-to-pypi.yml         |  26 +----
 pyproject.toml                                |   2 +
 .../configs/describe_codes.yaml               |   2 +-
 .../configs/launch_autogluon.yaml             |  25 ++---
 .../configs/launch_model.yaml                 |  18 +--
 .../configs/model/knn_classifier.yaml         |  44 --------
 .../configs/model/logistic_regression.yaml    |  49 ---------
 .../model/random_forest_classifier.yaml       |  51 ---------
 .../configs/model/sgd_classifier.yaml         |  38 -------
 .../configs/model/xgboost.yaml                |  47 --------
.../configs/model_launcher/autogluon.yaml | 3 + .../data_loading_params/default.yaml | 2 + .../data_processing_params/default.yaml | 3 + .../imputer/default.yaml | 0 .../imputer/mean_imputer.yaml | 0 .../imputer/median_imputer.yaml | 0 .../imputer/mode_imputer.yaml | 0 .../normalization/default.yaml | 0 .../normalization/max_abs_scaler.yaml | 0 .../normalization/standard_scaler.yaml | 0 .../configs/model_launcher/default.yaml | 7 ++ .../model_launcher/hydra/sweeper/default.yaml | 5 + .../model_launcher/knn_classifier.yaml | 31 ++++++ .../model_launcher/logistic_regression.yaml | 35 ++++++ .../configs/model_launcher/path/default.yaml | 10 ++ .../random_forest_classifier.yaml | 38 +++++++ .../model_launcher/sgd_classifier.yaml | 26 +++++ .../configs/model_launcher/xgboost.yaml | 30 +++++ .../configs/tabularization.yaml | 2 +- .../configs/task_specific_caching.yaml | 2 +- src/MEDS_tabular_automl/describe_codes.py | 4 +- src/MEDS_tabular_automl/scripts/cache_task.py | 2 + .../scripts/describe_codes.py | 9 +- .../scripts/launch_autogluon.py | 30 +++-- .../scripts/launch_model.py | 12 +- .../scripts/tabularize_static.py | 2 + .../scripts/tabularize_time_series.py | 2 + src/MEDS_tabular_automl/sklearn_model.py | 7 +- src/MEDS_tabular_automl/utils.py | 104 ++++++++++++++++-- src/MEDS_tabular_automl/xgboost_model.py | 3 +- 40 files changed, 352 insertions(+), 319 deletions(-) delete mode 100644 src/MEDS_tabular_automl/configs/model/knn_classifier.yaml delete mode 100644 src/MEDS_tabular_automl/configs/model/logistic_regression.yaml delete mode 100644 src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml delete mode 100644 src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml delete mode 100644 src/MEDS_tabular_automl/configs/model/xgboost.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/imputer/default.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/imputer/mean_imputer.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/imputer/median_imputer.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/imputer/mode_imputer.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/normalization/default.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/normalization/max_abs_scaler.yaml (100%) rename src/MEDS_tabular_automl/configs/{ => model_launcher/data_processing_params}/normalization/standard_scaler.yaml (100%) create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml create mode 100644 
src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index d86806f..7a68958 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest environment: name: pypi - url: https://pypi.org/p/ # Replace with your PyPI project name + url: https://pypi.org/p/meds-tab # Replace with your PyPI project name permissions: id-token: write # IMPORTANT: mandatory for trusted publishing @@ -91,27 +91,3 @@ jobs: gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}' - - publish-to-testpypi: - name: Publish Python 🐍 distribution 📦 to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/ - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v3 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ diff --git a/pyproject.toml b/pyproject.toml index 59b30cf..1b75489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ dependencies = [ "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] +[tool.setuptools_scm] + [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index ec980bf..8d0aac3 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -2,7 +2,7 @@ defaults: - default - _self_ -input_dir: ${output_cohort_dir}/data +input_dir: ${MEDS_cohort_dir}/data # Where to store output code frequency data output_filepath: ${output_cohort_dir}/metadata/codes.parquet diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml index 908e79d..19ae671 100644 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml @@ -3,26 +3,25 @@ defaults: - tabularization: default - imputer: default - normalization: default + - model_launcher: autogluon - _self_ -task_name: task +task_name: ??? # Task cached data dir input_dir: ${output_cohort_dir}/${task_name}/task_cache # Directory with task labels input_label_dir: ${output_cohort_dir}/${task_name}/labels/ # Where to output the model and cached data -model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S} -model_log_dir: ${model_dir}/.logs/ -output_filepath: ${model_dir} - -# Model parameters -model_params: - iterator: - keep_data_in_memory: True - binarize_task: True - -log_dir: ${model_dir}/.logs/ -log_filepath: ${log_dir}/log.txt +output_dir: ??? 
name: launch_autogluon + +hydra: + verbose: False + job: + name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ${model_log_dir} + run: + dir: ${model_log_dir} diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 9938cd9..2bc064d 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -2,30 +2,22 @@ defaults: - _self_ - default - tabularization: default - - model: xgboost # This can be changed to sgd_classifier or any other model - - imputer: default - - normalization: default + - model_launcher: xgboost - override hydra/callbacks: evaluation_callback - override hydra/sweeper: optuna - override hydra/sweeper/sampler: tpe - override hydra/launcher: joblib -task_name: task +task_name: ??? # Task cached data dir input_dir: ${output_cohort_dir}/${task_name}/task_cache # Directory with task labels input_label_dir: ${output_cohort_dir}/${task_name}/labels/ # Where to output the model and cached data -model_saving: - model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} - model_file_stem: model - model_file_extension: .json - delete_below_top_k: -1 -model_logging: - model_log_dir: ${model_saving.model_dir}/.logs/ - performance_log_stem: performance - config_log_stem: config +output_dir: ??? + +delete_below_top_k: -1 name: launch_model diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml deleted file mode 100644 index 1ca034a..0000000 --- a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.neighbors.KNeighborsClassifier - weights: "distance" - leaf_size: 30 - p: 2 - metric: "minkowski" - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_neighbors: range(1, 20) - model_params.model.weights: choice(['uniform', 'distance']) - model_params.model.leaf_size: range(10, 50) - model_params.model.p: choice([1, 2]) - model_params.model.metric: choice(['minkowski', 'euclidean', 'manhattan']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml deleted file mode 100644 index 0f74a7b..0000000 --- a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: 
${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.LogisticRegression - penalty: "l2" - dual: false - tol: 0.0001 - C: 1.0 - fit_intercept: True - intercept_scaling: 1 - class_weight: null - random_state: null - solver: "lbfgs" - max_iter: 100 - - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.C: tag(log, interval(1e-6, 1)) - model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml deleted file mode 100644 index 58a9671..0000000 --- a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.ensemble.RandomForestClassifier - criterion: "gini" - max_depth: null - min_samples_split: 2 - min_samples_leaf: 1 - min_weight_fraction_leaf: 0.0 - max_features: "sqrt" - max_leaf_nodes: null - min_impurity_decrease: 0.0 - bootstrap: True - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_estimators: range(50, 300, 50) - model_params.model.max_depth: choice([null, 10, 20, 30, 40, 50]) - model_params.model.min_samples_split: range(2, 11) - model_params.model.min_samples_leaf: range(1, 5) - model_params.model.max_features: choice(['sqrt', 'log2', null]) - model_params.model.bootstrap: choice([True, False]) - model_params.model.criterion: choice(['gini', 'entropy']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml deleted file mode 100644 index 2f2b57f..0000000 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: 
${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.SGDClassifier - loss: log_loss - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - params: - +model_params.model.alpha: tag(log, interval(1e-6, 1)) - +model_params.model.l1_ratio: interval(0, 1) - +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml deleted file mode 100644 index 793cc29..0000000 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. - -model_params: - num_boost_round: 1000 - early_stopping_rounds: 5 - model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) - +model_params.model.max_depth: range(2, 16) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml new file mode 100644 index 0000000..b7d02cd --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml @@ -0,0 +1,3 @@ +defaults: + - default + - _self_ diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml new file mode 100644 index 0000000..723131f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml @@ -0,0 +1,2 @@ +keep_data_in_memory: True +binarize_task: True diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml new file mode 100644 index 0000000..5cf0c5b --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml @@ -0,0 +1,3 @@ +defaults: + - imputer: default + - normalization: default diff --git 
a/src/MEDS_tabular_automl/configs/imputer/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/default.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/default.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml new file mode 100644 index 0000000..6df964d --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml @@ -0,0 +1,7 @@ +defaults: + - path: default + - data_processing_params: default + - data_loading_params: default + - _self_ + +tabularization: ${tabularization} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml new file mode 100644 index 0000000..a1faf8f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml @@ -0,0 +1,5 @@ +hydra: + sweeper: + direction: maximize + n_trials: 250 + n_jobs: 25 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml 
b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml new file mode 100644 index 0000000..d6227f2 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -0,0 +1,31 @@ +defaults: + - default + - hydra/sweeper: default + - _self_ + +_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + +model: + _target_: sklearn.neighbors.KNeighborsClassifier + weights: "distance" + leaf_size: 30 + p: 2 + metric: "minkowski" + +training_params: + epochs: 20 + early_stopping_rounds: 5 + +path: + model_file_extension: .pkl + +hydra: + sweeper: + +params: + model.n_neighbors: range(1, 20) + model.weights: choice(['uniform', 'distance']) + model.leaf_size: range(10, 50) + model.p: choice([1, 2]) + model.metric: choice(['minkowski', 'euclidean', 'manhattan']) + epochs: range(10, 100) + early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml new file mode 100644 index 0000000..58e3753 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -0,0 +1,35 @@ +defaults: + - default + - hydra/sweeper: default + - _self_ + +_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + +model: + _target_: sklearn.linear_model.LogisticRegression + penalty: "l2" + dual: false + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: 1 + class_weight: null + random_state: null + solver: "lbfgs" + max_iter: 100 + +training_params: + epochs: 20 + early_stopping_rounds: 5 + +path: + model_file_extension: .pkl + +hydra: + sweeper: + +params: + model.C: tag(log, interval(1e-6, 1)) + model.penalty: choice(['l1', 'l2', 'elasticnet']) + model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) + epochs: range(10, 100) + early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml new file mode 100644 index 0000000..1e77b62 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -0,0 +1,10 @@ +input_dir: ${input_dir} +input_label_dir: ${input_label_dir} +output_dir: ${output_dir} +model_file_stem: model +model_file_extension: .json +log_dir: ${log_dir} +cache_dir: ${cache_dir} +model_log_dir: ${output_dir}/.logs/ +performance_log_stem: performance +config_log_stem: config diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml new file mode 100644 index 0000000..bfb285c --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -0,0 +1,38 @@ +defaults: + - default + - hydra/sweeper: default + - _self_ + +_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + +model: + _target_: sklearn.ensemble.RandomForestClassifier + criterion: "gini" + max_depth: null + min_samples_split: 2 + min_samples_leaf: 1 + min_weight_fraction_leaf: 0.0 + max_features: "sqrt" + max_leaf_nodes: null + min_impurity_decrease: 0.0 + bootstrap: True + +training_params: + epochs: 20 + early_stopping_rounds: 5 + +path: + model_file_extension: .pkl + +hydra: + sweeper: + +params: + model.n_estimators: range(50, 300, 50) + model.max_depth: choice([null, 10, 20, 30, 40, 50]) + model.min_samples_split: range(2, 11) + model.min_samples_leaf: range(1, 5) + model.max_features: 
choice(['sqrt', 'log2', null]) + model.bootstrap: choice([True, False]) + model.criterion: choice(['gini', 'entropy']) + epochs: range(10, 100) + early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml new file mode 100644 index 0000000..1a8a7e0 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -0,0 +1,26 @@ +defaults: + - default + - hydra/sweeper: default + - _self_ + +_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + +model: + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss + +training_params: + epochs: 20 + early_stopping_rounds: 5 + +path: + model_file_extension: .pkl + +hydra: + sweeper: + +params: + model.alpha: tag(log, interval(1e-6, 1)) + model.l1_ratio: interval(0, 1) + model.penalty: choice(['l1', 'l2', 'elasticnet']) + epochs: range(10, 100) + early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml new file mode 100644 index 0000000..2b69965 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -0,0 +1,30 @@ +defaults: + - default + - hydra/sweeper: default + - _self_ + +_target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize + +model: + booster: gbtree + device: cpu + nthread: 1 + tree_method: hist + objective: binary:logistic + +training_params: + num_boost_round: 1000 + early_stopping_rounds: 5 + +hydra: + sweeper: + +params: + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) + model.max_depth: range(2, 16) + tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index cf03d63..ca2c4cb 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -6,7 +6,7 @@ defaults: # Raw data # Where the code metadata is stored input_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet -input_dir: ${output_cohort_dir}/data +input_dir: ${MEDS_cohort_dir}/data output_dir: ${output_cohort_dir}/tabularize name: tabularization diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 63fed0f..80510f6 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -2,7 +2,7 @@ defaults: - default - tabularization: default - _self_ -task_name: task +task_name: ??? 
# Tabularized Data input_dir: ${output_cohort_dir}/tabularize diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 70c53bd..23cdb98 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -2,7 +2,7 @@ import polars as pl -from MEDS_tabular_automl.utils import DF_T, get_feature_names +from MEDS_tabular_automl.utils import get_feature_names def convert_to_df(freq_dict: dict[str, int]) -> pl.DataFrame: @@ -65,7 +65,7 @@ def convert_to_freq_dict(df: pl.LazyFrame) -> dict[str, dict[int, int]]: return dict(df.collect().iter_rows()) -def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: +def compute_feature_frequencies(shard_df: pl.LazyFrame) -> pl.DataFrame: """Generates a DataFrame containing the frequencies of codes and numerical values under different aggregations by computing frequency counts for certain attributes and organizing the results into specific categories based on the dataset's features. diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 0903aa3..2ba030d 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -25,6 +25,7 @@ hydra_loguru_init, load_matrix, load_tqdm, + tabularize_init, write_df, ) @@ -79,6 +80,7 @@ def main(cfg: DictConfig): Args: cfg: The configuration for processing, loaded from a YAML file. """ + tabularize_init(cfg) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index c29b542..a742a29 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -17,7 +17,13 @@ ) from ..file_name import list_subdir_files from ..mapper import wrap as rwlock_wrap -from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, write_df +from ..utils import ( + get_shard_prefix, + hydra_loguru_init, + load_tqdm, + tabularize_init, + write_df, +) config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): @@ -32,6 +38,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. """ + tabularize_init(cfg) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index db61e9f..dbd411d 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -6,15 +6,27 @@ from loguru import logger from omegaconf import DictConfig +try: + import autogluon.tabular as ag +except ImportError: + ag = None + from MEDS_tabular_automl.dense_iterator import DenseIterator -from ..utils import hydra_loguru_init +from ..utils import hydra_loguru_init, launch_model_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") +def check_autogluon(): + if ag is None: + raise ImportError( + "AutoGluon could not be imported. 
Please try installing it using: `pip install autogluon`" + ) + + @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Launches AutoGluon after collecting data based on the provided configuration. @@ -22,17 +34,11 @@ def main(cfg: DictConfig) -> float: Args: cfg: The configuration dictionary specifying model and training parameters. """ - - # print(OmegaConf.to_yaml(cfg)) + check_autogluon() + launch_model_init(cfg) if not cfg.loguru_init: hydra_loguru_init() - # check that autogluon is installed - try: - import autogluon.tabular as ag - except ImportError: - logger.error("AutoGluon is not installed. Please install AutoGluon.") - # collect data based on the configuration itrain = DenseIterator(cfg, "train") ituning = DenseIterator(cfg, "tuning") @@ -44,13 +50,13 @@ def main(cfg: DictConfig) -> float: held_out_data, held_out_labels = iheld_out.densify() # construct dfs for AutoGluon - train_df = pd.DataFrame(train_data.todense()) # , columns=cols) + train_df = pd.DataFrame(train_data.todense()) train_df[cfg.task_name] = train_labels tuning_df = pd.DataFrame( tuning_data.todense(), - ) # columns=cols) + ) tuning_df[cfg.task_name] = tuning_labels - held_out_df = pd.DataFrame(held_out_data.todense()) # , columns=cols) + held_out_df = pd.DataFrame(held_out_data.todense()) held_out_df[cfg.task_name] = held_out_labels train_dataset = ag.TabularDataset(train_df) diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 97943b4..cc9c7bb 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -3,11 +3,11 @@ from pathlib import Path import hydra -from omegaconf import DictConfig, open_dict +from omegaconf import DictConfig from MEDS_tabular_automl.base_model import BaseModel -from ..utils import hydra_loguru_init, log_to_logfile +from ..utils import hydra_loguru_init, launch_model_init, log_to_logfile config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): @@ -24,25 +24,21 @@ def main(cfg: DictConfig) -> float: Returns: The evaluation result as the ROC AUC score on the held-out test set. """ + launch_model_init(cfg) - # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() model: BaseModel = hydra.utils.instantiate(cfg.model_target) - # TODO - make tabularuzation be copied in the yaml instead of here - with open_dict(cfg): - model.cfg.tabularization = hydra.utils.instantiate(cfg.tabularization) model.train() auc = model.evaluate() - # logger.info(f"AUC: {auc}") # save model output_fp = Path(cfg.model_saving.model_dir) output_fp = ( output_fp.parent - / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_saving.model_file_extension}" + / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_target.model_file_extension}" ) output_fp.parent.mkdir(parents=True, exist_ok=True) diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 34d9c0d..d692142 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -30,6 +30,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + tabularize_init, write_df, ) @@ -81,6 +82,7 @@ def main( .. 
_link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ + tabularize_init(cfg) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 8eca7f4..8de82a3 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -26,6 +26,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + tabularize_init, write_df, ) @@ -64,6 +65,7 @@ def main( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. """ + tabularize_init(cfg) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 56063fd..9bd96a3 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -1,4 +1,5 @@ from pathlib import Path +from pickle import dump import numpy as np import scipy.sparse as sp @@ -224,9 +225,9 @@ def save_model(self, output_fp: str): if not hasattr(self.model, "save_model"): logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") logger.info("Model will be saved using pickle dump.") - from pickle import dump - - with open(output_fp.parent / "model.pkl", "wb") as f: + if not output_fp.endswith(".pkl"): + raise ValueError("Model file extension must be .pkl.") + with open(output_fp, "wb") as f: dump(self.model, f, protocol=5) else: self.model.save_model(output_fp) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 6b726f9..17498eb 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -1,21 +1,15 @@ -"""The base class for core dataset processing logic. - -Attributes: - INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, - dataframes, etc. - DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. -""" +"""The base class for core dataset processing logic and script utilities.""" import os +import sys from pathlib import Path import hydra import numpy as np import polars as pl from loguru import logger -from omegaconf import OmegaConf +from omegaconf import DictConfig, OmegaConf from scipy.sparse import coo_array -DF_T = pl.LazyFrame WRITE_USE_PYARROW = True ROW_IDX_NAME = "__row_idx" @@ -422,3 +416,95 @@ def log_to_logfile(model, cfg, output_fp): f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") + + +def current_script_name() -> str: + """Returns the name of the module that called this function.""" + + main_module = sys.modules["__main__"] + main_func = getattr(main_module, "main", None) + if main_func and callable(main_func): + func_module = main_func.__module__ + if func_module == "__main__": + return Path(sys.argv[0]).stem + else: + return func_module.split(".")[-1] + + logger.warning("Can't find main function in __main__ module. Using sys.argv[0] as a fallback.") + return Path(sys.argv[0]).stem + + +def tabularize_init(cfg: DictConfig): + """Initializes the stage by logging the configuration and the stage-specific paths. 
+ + Args: + cfg: The global configuration object, which should have a ``cfg.stage_cfg`` attribute containing the + stage specific configuration. + + Returns: The data input directory, stage output directory, and metadata input directory. + """ + hydra_loguru_init() + + logger.info( + f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" + ) + + input_dir = Path(cfg.data_input_dir) + output_dir = Path(cfg.stage_cfg.output_dir) + metadata_input_dir = Path(cfg.stage_cfg.metadata_input_dir) + + def chk(x: Path): + return "✅" if x.exists() else "❌" + + paths_strs = [ + f" - {k}: {chk(v)} {str(v.resolve())}" + for k, v in { + "input_dir": input_dir, + "output_dir": output_dir, + "metadata_input_dir": metadata_input_dir, + }.items() + ] + + logger_strs = [ + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}", + "Paths: (checkbox indicates if it exists)", + ] + logger.debug("\n".join(logger_strs + paths_strs)) + + +def launch_model_init(cfg: DictConfig): + """Initializes the stage by logging the configuration and the stage-specific paths. + + Args: + cfg: The global configuration object, which should have a ``cfg.stage_cfg`` attribute containing the + stage specific configuration. + + Returns: The data input directory, stage output directory, and metadata input directory. + """ + hydra_loguru_init() + + logger.info( + f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" + ) + + input_dir = Path(cfg.data_input_dir) + output_dir = Path(cfg.stage_cfg.output_dir) + metadata_input_dir = Path(cfg.stage_cfg.metadata_input_dir) + + def chk(x: Path): + return "✅" if x.exists() else "❌" + + paths_strs = [ + f" - {k}: {chk(v)} {str(v.resolve())}" + for k, v in { + "input_dir": input_dir, + "output_dir": output_dir, + "metadata_input_dir": metadata_input_dir, + }.items() + ] + + logger_strs = [ + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}", + "Paths: (checkbox indicates if it exists)", + ] + logger.debug("\n".join(logger_strs + paths_strs)) diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index ec33c15..fe8d96c 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -130,7 +130,8 @@ def _build(self): def _train(self): """Trains the model.""" - self.model = xgb.train( + self.model = self.cfg.model + self.model = self.model.train( OmegaConf.to_container(self.cfg.model_params.model), self.dtrain, num_boost_round=self.cfg.model_params.num_boost_round, From 0e985ee27c6cccfae8c6ce41a7f164acca88e5bc Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sat, 7 Sep 2024 23:08:39 +0000 Subject: [PATCH 25/54] eval callback --- .../evaluation_callback.py | 51 ++++++++----------- src/MEDS_tabular_automl/utils.py | 4 +- tests/test_integration.py | 14 +++++ tests/test_tabularize.py | 6 ++- 4 files changed, 43 insertions(+), 32 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index fddb858..6af58c0 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -1,4 +1,3 @@ -import ast from pathlib import Path import polars as pl @@ -16,51 +15,45 @@ def on_multirun_end(self, config: DictConfig, **kwargs): log_fp = Path(config.model_logging.model_log_dir) try: - performance = pl.read_csv(log_fp / "*/*.csv") + perf = pl.read_csv(log_fp / f"*/*{config.model_logging.performance_log_stem}.log") except Exception as e: raise 
FileNotFoundError(f"Log files incomplete or not found at {log_fp}, exception {e}.") - performance = performance.sort("tuning_auc", descending=True, nulls_last=True) - logger.info(performance.head(10)) + perf = perf.sort("tuning_auc", descending=True, nulls_last=True) + logger.info(f"\nPerformance of the top 10 models:\n{perf.head(10)}") # get best model_fp - best_model = performance[0, 0] - - best_params_fp = log_fp / best_model / f"{config.model_logging.config_log_stem}.json" - - # check if this file exists - if not best_params_fp.is_file(): - raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}") + best_model = perf[0, 0] logger.info(f"The best model can be found at {best_model}") - # self.log_performance(performance.head(1)) - # self.log_hyperparams(best_hyperparams) + self.log_performance(perf[0, :]) + # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log") if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( - performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir + perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir ) - return performance.head(1) + return perf.head(1) - def log_performance(self, performance): + def log_performance(self, perf): """logger.info performance of the best model with nice formatting.""" - logger.info("Performance of the best model:") - logger.info(f"Tuning AUC: {performance['tuning_auc'].values[0]}") - logger.info(f"Test AUC: {performance['test_auc'].values[0]}") - - def log_hyperparams(self, hyperparams): - """logger.info hyperparameters of the best model with nice formatting.""" - logger.info("Hyperparameters of the best model:") logger.info( - f"Tabularization: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['tabularization'].values[0]))}" - ) - logger.info( - f"Model parameters: {OmegaConf.to_yaml(ast.literal_eval(hyperparams['model_params'].values[0]))}" + "\nPerformance of the best model:\n", + f"Tuning AUC: {perf['tuning_auc'][0]}\nTest AUC: {perf['test_auc'][0]}", ) - def delete_below_top_k_models(self, performance, k, model_dir): + def log_hyperparams(self, best_params_fp): + """logger.info hyperparameters of the best model with nice formatting.""" + # check if this file exists + if not best_params_fp.is_file(): + raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}") + best_params = OmegaConf.load(best_params_fp) + # print using OmegaConf.to_yaml + logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}") + + def delete_below_top_k_models(self, perf, k, model_dir): """Save only top k models from the model directory and delete all other files.""" - top_k_models = performance.head(k)["model_fp"].values + top_k_models = perf.head(k)["model_fp"].values for model_fp in Path(model_dir).iterdir(): if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp) not in top_k_models: model_fp.unlink() diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 6b726f9..3d6f496 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -412,11 +412,11 @@ def log_to_logfile(model, cfg, output_fp): out_fp.mkdir(parents=True, exist_ok=True) # config as a json - config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.json" + config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.log" with open(config_fp, "w") as f: f.write(OmegaConf.to_yaml(cfg)) - 
model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.csv" + model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.log" with open(model_performance_fp, "w") as f: f.write("model_fp,tuning_auc,test_auc\n") f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") diff --git a/tests/test_integration.py b/tests/test_integration.py index 907b038..f0b20dc 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -229,3 +229,17 @@ def test_integration(tmp_path): cache_config, "task_specific_caching", ) + stderr, stdout = run_command( + "meds-tab-model", + [ + "--multirun", + f"tabularization.window_sizes={stdout_ws.strip()}", + f"tabularization.aggs={stdout_agg.strip()}", + "hydra.sweeper.n_jobs=5", + "hydra.sweeper.n_trials=10", + ], + cache_config, + "xgboost-model", + ) + assert "The best model can be found at" in stderr + assert "Performance of the best model:" in stderr diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index b1bd879..d110121 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -354,7 +354,11 @@ def test_tabularize(tmp_path): HydraConfig().set_config(cfg) launch_model.main(cfg) output_files = list(output_dir.glob("**/*.json")) - assert len(output_files) == 2 + assert len(output_files) == 1 + + log_dir = Path(cfg.model_logging.model_log_dir) + log_csv = list(log_dir.glob("**/*.log")) + assert len(log_csv) == 2 sklearnmodel_config_kwargs = { **shared_config, From 139870f70f50ec27bffc3009aa5899239f65d220 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sat, 7 Sep 2024 23:15:25 +0000 Subject: [PATCH 26/54] eval callback --- src/MEDS_tabular_automl/evaluation_callback.py | 5 +++-- tests/test_integration.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index 6af58c0..d9236f8 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -37,9 +37,10 @@ def on_multirun_end(self, config: DictConfig, **kwargs): def log_performance(self, perf): """logger.info performance of the best model with nice formatting.""" + tuning_auc = perf["tuning_auc"][0] + test_auc = perf["test_auc"][0] logger.info( - "\nPerformance of the best model:\n", - f"Tuning AUC: {perf['tuning_auc'][0]}\nTest AUC: {perf['test_auc'][0]}", + f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}", ) def log_hyperparams(self, best_params_fp): diff --git a/tests/test_integration.py b/tests/test_integration.py index f0b20dc..055070c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -242,4 +242,4 @@ def test_integration(tmp_path): "xgboost-model", ) assert "The best model can be found at" in stderr - assert "Performance of the best model:" in stderr + assert "Performance of best model:" in stderr From 0d5e9e8a0057997ae64dcff3f971e144d48948db Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 15:56:57 -0400 Subject: [PATCH 27/54] Updated pre-commit config too. 
--- .pre-commit-config.yaml | 4 +- src/MEDS_tabular_automl/dense_iterator.py | 37 ----------------- .../scripts/launch_autogluon.py | 2 +- src/MEDS_tabular_automl/sklearn_model.py | 41 +------------------ src/MEDS_tabular_automl/tabular_dataset.py | 19 +++++++++ src/MEDS_tabular_automl/xgboost_model.py | 5 +-- 6 files changed, 23 insertions(+), 85 deletions(-) delete mode 100644 src/MEDS_tabular_automl/dense_iterator.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1533f74..6fd8933 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,5 @@ default_language_version: - python: python3.12 - -exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports" + python: python3.11 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py deleted file mode 100644 index 33d13b0..0000000 --- a/src/MEDS_tabular_automl/dense_iterator.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -import scipy.sparse as sp -from mixins import TimeableMixin -from omegaconf import DictConfig - -from .tabular_dataset import TabularDataset - - -class DenseIterator(TabularDataset, TimeableMixin): - def __init__(self, cfg: DictConfig, split: str): - """Initializes the SklearnIterator with the provided configuration and data split. - - Args: - cfg: The configuration dictionary. - split: The data split to use. - """ - TabularDataset.__init__(self, cfg=cfg, split=split) - TimeableMixin.__init__(self) - self.valid_event_ids, self.labels = self._load_ids_and_labels() - # check if the labels are empty - if len(self.labels) == 0: - raise ValueError("No labels found.") - # self._it = 0 - - def densify(self) -> np.ndarray: - """Builds the data as a dense matrix based on column subselection.""" - - # get the dense matrix by iterating through the data shards - data = [] - labels = [] - for shard_idx in range(len(self._data_shards)): - shard_data, shard_labels = self.get_data_shards(shard_idx) - data.append(shard_data) - labels.append(shard_labels) - data = sp.vstack(data) - labels = np.concatenate(labels, axis=0) - return data, labels diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index db61e9f..d184603 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -6,7 +6,7 @@ from loguru import logger from omegaconf import DictConfig -from MEDS_tabular_automl.dense_iterator import DenseIterator +from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator from ..utils import hydra_loguru_init diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 56063fd..dbc519f 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -7,46 +7,7 @@ from sklearn.metrics import roc_auc_score from .base_model import BaseModel -from .tabular_dataset import TabularDataset - - -class SklearnIterator(TabularDataset): - """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models. - - This class provides functionality for iterating through data shards, loading - feature data and labels, and processing them based on the provided configuration. - - Args: - cfg: A configuration dictionary containing parameters for - data processing, feature selection, and other settings. 
- split: The data split to use, which can be one of "train", "tuning", - or "held_out". This determines which subset of the data is loaded and processed. - - Attributes: - cfg: Configuration dictionary containing parameters for - data processing, feature selection, and other settings. - file_name_resolver: Object for resolving file names and paths based on the configuration. - split: The data split being used for loading and processing data shards. - _data_shards: List of data shard names. - valid_event_ids: Dictionary mapping shard number to a list of valid event IDs. - labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs. - codes_set: Set of codes to include in the data. - code_masks: Dictionary of code masks for filtering features based on aggregation. - num_features: Total number of features in the data. - """ - - def __init__(self, cfg: DictConfig, split: str): - """Initializes the SklearnIterator with the provided configuration and data split. - - Args: - cfg: The configuration dictionary. - split: The data split to use. - """ - super().__init__(cfg=cfg, split=split) - self.valid_event_ids, self.labels = self._load_ids_and_labels() - # check if the labels are empty - if len(self.labels) == 0: - raise ValueError("No labels found.") +from .tabular_dataset import TabularDataset as SklearnIterator class SklearnMatrix: diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 84a6609..b698904 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -61,6 +61,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self._set_scaler() self._set_imputer() + self.valid_event_ids, self.labels = self._load_ids_and_labels() + # check if the labels are empty + if len(self.labels) == 0: + raise ValueError("No labels found.") + @TimeableMixin.TimeAs def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]: """Creates boolean masks for filtering features. @@ -497,3 +502,17 @@ def extract_name(test_file): all_indices.extend(feature_ids) return all_feats, all_indices + + def densify(self) -> np.ndarray: + """Builds the data as a dense matrix based on column subselection.""" + + # get the dense matrix by iterating through the data shards + data = [] + labels = [] + for shard_idx in range(len(self._data_shards)): + shard_data, shard_labels = self.get_data_shards(shard_idx) + data.append(shard_data) + labels.append(shard_labels) + data = sp.vstack(data) + labels = np.concatenate(labels, axis=0) + return data, labels diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index ec33c15..ea7ab92 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str): """ xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) TabularDataset.__init__(self, cfg=cfg, split=split) - self.valid_event_ids, self.labels = self._load_ids_and_labels() - # check if the labels are empty - if self.labels is None: - raise ValueError("No labels found.") + self._it = 0 def next(self, input_data: Callable) -> int: From 2563aafe29e0e4ca0a470234738a3bd804c0b611 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 16:39:20 -0400 Subject: [PATCH 28/54] Removed a function that was not yet implemented. 
--- src/MEDS_tabular_automl/tabular_dataset.py | 28 ---------------------- 1 file changed, 28 deletions(-) diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index b698904..f06433c 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -475,34 +475,6 @@ def extract_name(test_file): all_feats = [all_feats[i] for i in indices] return all_feats - def get_columns_and_indices(self) -> tuple[list[str], list[int]]: - """Retrieves the names and indices of the columns in the data. - - Returns: - A tuple containing the names of the columns and their indices. - """ - raise NotImplementedError("This method is not implemented yet.") - files = get_model_files(self.cfg, self.split, self._data_shards[0]) - - def extract_name(test_file): - return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem)) - - agg_wind_combos = [extract_name(test_file) for test_file in files] - - feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp) - all_feats = [] - all_indices = [] - for agg_wind in agg_wind_combos: - window, feat, agg = agg_wind.split("/") - feature_ids = get_feature_indices(feat + "/" + agg, feature_columns) - feature_names = [feature_columns[i] for i in feature_ids] - for feat_name in feature_names: - all_feats.append(f"{feat_name}/{agg}/{window}") - # use mask to append indices - all_indices.extend(feature_ids) - - return all_feats, all_indices - def densify(self) -> np.ndarray: """Builds the data as a dense matrix based on column subselection.""" From 2d80905692c175b9777d9d3988a10aa9ff957cc8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 16:42:27 -0400 Subject: [PATCH 29/54] Removing unused function in evaluation callback. 
--- src/MEDS_tabular_automl/evaluation_callback.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index d9236f8..0a394c5 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -3,13 +3,10 @@ import polars as pl from hydra.experimental.callback import Callback from loguru import logger -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig class EvaluationCallback(Callback): - def __init__(self, **kwargs): - self.kwargs = kwargs - def on_multirun_end(self, config: DictConfig, **kwargs): """Find best model based on log files and logger.info its performance and hyperparameters.""" log_fp = Path(config.model_logging.model_log_dir) @@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs): logger.info(f"The best model can be found at {best_model}") self.log_performance(perf[0, :]) - # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log") if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir @@ -43,15 +39,6 @@ def log_performance(self, perf): f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}", ) - def log_hyperparams(self, best_params_fp): - """logger.info hyperparameters of the best model with nice formatting.""" - # check if this file exists - if not best_params_fp.is_file(): - raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}") - best_params = OmegaConf.load(best_params_fp) - # print using OmegaConf.to_yaml - logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}") - def delete_below_top_k_models(self, perf, k, model_dir): """Save only top k models from the model directory and delete all other files.""" top_k_models = perf.head(k)["model_fp"].values From d29ece9dbe4f02cf8863b4edf97c32a468c4b461 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Sun, 8 Sep 2024 21:06:14 +0000 Subject: [PATCH 30/54] eval callback --- .../evaluation_callback.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index d9236f8..30618db 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -15,30 +15,30 @@ def on_multirun_end(self, config: DictConfig, **kwargs): log_fp = Path(config.model_logging.model_log_dir) try: - perf = pl.read_csv(log_fp / f"*/*{config.model_logging.performance_log_stem}.log") + performance = pl.read_csv(log_fp / f"*/*{config.model_logging.performance_log_stem}.log") except Exception as e: raise FileNotFoundError(f"Log files incomplete or not found at {log_fp}, exception {e}.") - perf = perf.sort("tuning_auc", descending=True, nulls_last=True) - logger.info(f"\nPerformance of the top 10 models:\n{perf.head(10)}") + performance = performance.sort("tuning_auc", descending=True, nulls_last=True) + logger.info(f"\nPerformance of the top 10 models:\n{performance.head(10)}") # get best model_fp - best_model = perf[0, 0] + best_model = performance[0, 0] logger.info(f"The best model can be found at {best_model}") - self.log_performance(perf[0, :]) + self.log_performance(performance[0, :]) # self.log_hyperparams(log_fp / 
best_model / f"{config.model_logging.config_log_stem}.log") if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( - perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir + performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir ) - return perf.head(1) + return performance.head(1) - def log_performance(self, perf): + def log_performance(self, best_model_performance): """logger.info performance of the best model with nice formatting.""" - tuning_auc = perf["tuning_auc"][0] - test_auc = perf["test_auc"][0] + tuning_auc = best_model_performance["tuning_auc"][0] + test_auc = best_model_performance["test_auc"][0] logger.info( f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}", ) @@ -52,9 +52,9 @@ def log_hyperparams(self, best_params_fp): # print using OmegaConf.to_yaml logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}") - def delete_below_top_k_models(self, perf, k, model_dir): + def delete_below_top_k_models(self, performance, k, model_dir): """Save only top k models from the model directory and delete all other files.""" - top_k_models = perf.head(k)["model_fp"].values + top_k_models = performance.head(k)["model_fp"].values for model_fp in Path(model_dir).iterdir(): if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp) not in top_k_models: model_fp.unlink() From 81b022fef11427a41f61dcdfe18f01aa065ae0e7 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Sun, 8 Sep 2024 22:32:23 +0000 Subject: [PATCH 31/54] added yaml hierarchy for model_launcher --- README.md | 16 +-- docs/source/prediction.md | 16 +-- .../configs/launch_autogluon.yaml | 27 ------ .../configs/launch_model.yaml | 25 ++--- .../configs/model_launcher/__init__.py | 0 .../data_loading_params/__init__.py | 0 .../data_processing_params/__init__.py | 0 .../imputer/__init__.py | 0 .../normalization/__init__.py | 0 .../configs/model_launcher/default.yaml | 7 +- .../model_launcher/hydra/sweeper/default.yaml | 5 - .../model_launcher/knn_classifier.yaml | 29 +++--- .../model_launcher/logistic_regression.yaml | 43 ++++---- .../configs/model_launcher/path/__init__.py | 0 .../configs/model_launcher/path/default.yaml | 4 +- .../random_forest_classifier.yaml | 41 ++++---- .../model_launcher/sgd_classifier.yaml | 24 ++--- .../training_params/__init__.py | 0 .../training_params/default.yaml | 6 ++ .../configs/model_launcher/xgboost.yaml | 25 ++--- src/MEDS_tabular_automl/scripts/cache_task.py | 13 ++- .../scripts/describe_codes.py | 10 +- .../scripts/launch_autogluon.py | 6 +- .../scripts/launch_model.py | 6 +- .../scripts/tabularize_static.py | 6 +- .../scripts/tabularize_time_series.py | 6 +- src/MEDS_tabular_automl/sklearn_model.py | 8 +- src/MEDS_tabular_automl/tabular_dataset.py | 14 +-- src/MEDS_tabular_automl/utils.py | 80 +++++---------- src/MEDS_tabular_automl/xgboost_model.py | 8 +- tests/test_configs.py | 97 +++++++++++-------- 31 files changed, 254 insertions(+), 268 deletions(-) delete mode 100644 src/MEDS_tabular_automl/configs/launch_autogluon.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/__init__.py create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py create 
mode 100644 src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py delete mode 100644 src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py create mode 100644 src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml diff --git a/README.md b/README.md index 8900e41..bcf0e5f 100644 --- a/README.md +++ b/README.md @@ -529,14 +529,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/docs/source/prediction.md b/docs/source/prediction.md index 35131d8..cfd126c 100644 --- a/docs/source/prediction.md +++ b/docs/source/prediction.md @@ -107,14 +107,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml deleted file mode 100644 index 19ae671..0000000 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ /dev/null @@ -1,27 +0,0 @@ -defaults: - - default - - tabularization: default - - imputer: default - - normalization: default - - model_launcher: autogluon - - _self_ - -task_name: ??? - -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ -# Where to output the model and cached data -output_dir: ??? 
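For reference, the simplified sweep keys shown in the README and docs hunks above resolve against the regrouped `launch_model` composition introduced in this patch. A minimal sketch of composing that config outside the CLI, mirroring the overrides used in `tests/test_configs.py` later in this series (all paths below are placeholders):

```python
# Sketch only: compose the regrouped launch_model config and inspect the
# XGBoost node that the flattened sweep keys (model.eta, model.lambda, ...)
# refer to. Every path below is a placeholder.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"):
    cfg = compose(
        config_name="launch_model",
        overrides=[
            "model_launcher=xgboost",
            "MEDS_cohort_dir=/path/to/meds",
            "tabularized_dir=/path/to/tabularized",
            "output_model_dir=/path/to/models",
            "input_label_dir=/path/to/labels",
            "task_name=example_task",
        ],
    )
    print(OmegaConf.to_yaml(cfg.model_launcher.model))  # booster, tree_method, objective, ...
```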
- -name: launch_autogluon - -hydra: - verbose: False - job: - name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} - sweep: - dir: ${model_log_dir} - run: - dir: ${model_log_dir} diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 2bc064d..5975e72 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -1,31 +1,34 @@ defaults: - - _self_ - default - tabularization: default - model_launcher: xgboost - - override hydra/callbacks: evaluation_callback - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe + - override hydra/callbacks: evaluation_callback - override hydra/launcher: joblib + - _self_ task_name: ??? +# Directory of meds data +MEDS_cohort_dir: ??? +# directory of tabularized data +tabularized_dir: ??? # Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache +input_dir: ${tabularized_dir}/${task_name}/task_cache # Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +input_label_dir: ??? # Where to output the model and cached data -output_dir: ??? +output_model_dir: ??? + +output_cohort_dir: ${tabularized_dir} delete_below_top_k: -1 name: launch_model hydra: - verbose: False - job: - name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: - dir: ${model_log_dir} + dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/ + subdir: "1" run: - dir: ${model_log_dir} + dir: ${path.model_log_dir} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml index 6df964d..f9e5964 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml @@ -1,7 +1,12 @@ +# @package _global_ + defaults: - path: default - data_processing_params: default - data_loading_params: default - _self_ -tabularization: ${tabularization} +model_launcher: + data_processing_params: ${data_processing_params} + data_loading_params: ${data_loading_params} + tabularization: ${tabularization} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml deleted file mode 100644 index a1faf8f..0000000 --- 
a/src/MEDS_tabular_automl/configs/model_launcher/hydra/sweeper/default.yaml +++ /dev/null @@ -1,5 +0,0 @@ -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml index d6227f2..c2c32f4 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -1,27 +1,26 @@ +# @package _global_ + defaults: - default - - hydra/sweeper: default + - training_params: default - _self_ -_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - -model: - _target_: sklearn.neighbors.KNeighborsClassifier - weights: "distance" - leaf_size: 30 - p: 2 - metric: "minkowski" +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize -training_params: - epochs: 20 - early_stopping_rounds: 5 + model: + _target_: sklearn.neighbors.KNeighborsClassifier + weights: "distance" + leaf_size: 30 + p: 2 + metric: "minkowski" -path: - model_file_extension: .pkl + path: + model_file_extension: .pkl hydra: sweeper: - +params: + params: model.n_neighbors: range(1, 20) model.weights: choice(['uniform', 'distance']) model.leaf_size: range(10, 50) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml index 58e3753..1871e51 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -1,33 +1,36 @@ +# @package _global_ + defaults: - default - - hydra/sweeper: default + - training_params: default - _self_ -_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize -model: - _target_: sklearn.linear_model.LogisticRegression - penalty: "l2" - dual: false - tol: 0.0001 - C: 1.0 - fit_intercept: True - intercept_scaling: 1 - class_weight: null - random_state: null - solver: "lbfgs" - max_iter: 100 + model: + _target_: sklearn.linear_model.LogisticRegression + penalty: "l2" + dual: false + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: 1 + class_weight: null + random_state: null + solver: "lbfgs" + max_iter: 100 -training_params: - epochs: 20 - early_stopping_rounds: 5 + training_params: + epochs: 20 + early_stopping_rounds: 5 -path: - model_file_extension: .pkl + path: + model_file_extension: .pkl hydra: sweeper: - +params: + params: model.C: tag(log, interval(1e-6, 1)) model.penalty: choice(['l1', 'l2', 'elasticnet']) model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml index 1e77b62..0c6d4d1 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -1,10 +1,10 @@ input_dir: ${input_dir} input_label_dir: ${input_label_dir} -output_dir: ${output_dir} +output_dir: ${output_model_dir} model_file_stem: model model_file_extension: .json log_dir: ${log_dir} cache_dir: ${cache_dir} -model_log_dir: 
${output_dir}/.logs/ +model_log_dir: ${output_model_dir}/.logs/ performance_log_stem: performance config_log_stem: config diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml index bfb285c..ccc5f2e 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -1,32 +1,35 @@ +# @package _global_ + defaults: - default - - hydra/sweeper: default + - training_params: default - _self_ -_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize -model: - _target_: sklearn.ensemble.RandomForestClassifier - criterion: "gini" - max_depth: null - min_samples_split: 2 - min_samples_leaf: 1 - min_weight_fraction_leaf: 0.0 - max_features: "sqrt" - max_leaf_nodes: null - min_impurity_decrease: 0.0 - bootstrap: True + model: + _target_: sklearn.ensemble.RandomForestClassifier + criterion: "gini" + max_depth: null + min_samples_split: 2 + min_samples_leaf: 1 + min_weight_fraction_leaf: 0.0 + max_features: "sqrt" + max_leaf_nodes: null + min_impurity_decrease: 0.0 + bootstrap: True -training_params: - epochs: 20 - early_stopping_rounds: 5 + training_params: + epochs: 20 + early_stopping_rounds: 5 -path: - model_file_extension: .pkl + path: + model_file_extension: .pkl hydra: sweeper: - +params: + params: model.n_estimators: range(50, 300, 50) model.max_depth: choice([null, 10, 20, 30, 40, 50]) model.min_samples_split: range(2, 11) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml index 1a8a7e0..c62b7e1 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -1,24 +1,26 @@ +# @package _global_ defaults: - default - - hydra/sweeper: default + - training_params: default - _self_ -_target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize -model: - _target_: sklearn.linear_model.SGDClassifier - loss: log_loss + model: + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss -training_params: - epochs: 20 - early_stopping_rounds: 5 + training_params: + epochs: 20 + early_stopping_rounds: 5 -path: - model_file_extension: .pkl + path: + model_file_extension: .pkl hydra: sweeper: - +params: + params: model.alpha: tag(log, interval(1e-6, 1)) model.l1_ratio: interval(0, 1) model.penalty: choice(['l1', 'l2', 'elasticnet']) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml new file mode 100644 index 0000000..abd29f2 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml @@ -0,0 +1,6 @@ +# @package _global_ + +model_launcher: + training_params: + epochs: 20 + early_stopping_rounds: 5 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml index 2b69965..71e4637 
100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -1,24 +1,25 @@ +# @package _global_ defaults: - default - - hydra/sweeper: default - _self_ -_target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize +model_launcher: + _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize -model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic + model: + booster: gbtree + device: cpu + nthread: 1 + tree_method: hist + objective: binary:logistic -training_params: - num_boost_round: 1000 - early_stopping_rounds: 5 + training_params: + num_boost_round: 1000 + early_stopping_rounds: 5 hydra: sweeper: - +params: + params: model.eta: tag(log, interval(0.001, 1)) model.lambda: tag(log, interval(0.001, 1)) model.alpha: tag(log, interval(0.001, 1)) diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 2ba030d..36194cb 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -25,7 +25,7 @@ hydra_loguru_init, load_matrix, load_tqdm, - tabularize_init, + stage_init, write_df, ) @@ -80,7 +80,16 @@ def main(cfg: DictConfig): Args: cfg: The configuration for processing, loaded from a YAML file. """ - tabularize_init(cfg) + stage_init( + cfg, + [ + "input_dir", + "input_label_dir", + "output_dir", + "output_label_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index a742a29..ad57406 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -17,13 +17,7 @@ ) from ..file_name import list_subdir_files from ..mapper import wrap as rwlock_wrap -from ..utils import ( - get_shard_prefix, - hydra_loguru_init, - load_tqdm, - tabularize_init, - write_df, -) +from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, stage_init, write_df config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): @@ -38,7 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. """ - tabularize_init(cfg) + stage_init(cfg, ["input_dir", "output_filepath"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index dbd411d..0cda6da 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -13,7 +13,7 @@ from MEDS_tabular_automl.dense_iterator import DenseIterator -from ..utils import hydra_loguru_init, launch_model_init +from ..utils import hydra_loguru_init, stage_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") if not config_yaml.is_file(): @@ -35,7 +35,9 @@ def main(cfg: DictConfig) -> float: cfg: The configuration dictionary specifying model and training parameters. 
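The `stage_init(cfg, keys)` calls introduced above replace the per-stage init helpers with a single routine that takes the list of config paths it should validate. A toy illustration of that calling convention, not the helper itself (placeholder values; dotted keys resolved with `OmegaConf.select`, as the utils module in this series does):

```python
# Toy sketch of the stage_init(cfg, keys) convention: each key names a config
# entry holding a filesystem path, and the stage reports whether it exists.
from pathlib import Path

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "input_dir": "/data/meds",
        "output_dir": "/data/tabularized",
        "tabularization": {"filtered_code_metadata_fp": "/data/tabularized/metadata/codes.parquet"},
    }
)
for key in ["input_dir", "output_dir", "tabularization.filtered_code_metadata_fp"]:
    value = OmegaConf.select(cfg, key)  # dotted-path lookup
    print(f"{key}: {'exists' if value and Path(value).exists() else 'missing'} ({value})")
```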
""" check_autogluon() - launch_model_init(cfg) + stage_init( + cfg, ["input_dir", "input_label_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index cc9c7bb..a8a8d28 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -7,7 +7,7 @@ from MEDS_tabular_automl.base_model import BaseModel -from ..utils import hydra_loguru_init, launch_model_init, log_to_logfile +from ..utils import hydra_loguru_init, log_to_logfile, stage_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): @@ -24,7 +24,9 @@ def main(cfg: DictConfig) -> float: Returns: The evaluation result as the ROC AUC score on the held-out test set. """ - launch_model_init(cfg) + stage_init( + cfg, ["input_dir", "input_label_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index d692142..82b7870 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -30,7 +30,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, - tabularize_init, + stage_init, write_df, ) @@ -82,7 +82,9 @@ def main( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ - tabularize_init(cfg) + stage_init( + cfg, ["input_code_metadata_fp", "input_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 8de82a3..9dd508e 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -26,7 +26,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, - tabularize_init, + stage_init, write_df, ) @@ -65,7 +65,9 @@ def main( FileNotFoundError: If specified directories or files in the configuration are not found. ValueError: If required columns like 'code' or 'value' are missing in the data files. 
""" - tabularize_init(cfg) + stage_init( + cfg, ["input_code_metadata_fp", "input_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 9bd96a3..8edd245 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -102,7 +102,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -112,7 +112,7 @@ def __init__(self, cfg: DictConfig): self.dtuning = None self.dheld_out = None - self.model = cfg.model_params.model + self.model = cfg.model # check that self.model is a valid model if not hasattr(self.model, "fit"): raise ValueError("Model does not have a fit method.") @@ -134,7 +134,7 @@ def _fit_from_partial(self): classes = self.itrain.get_classes() best_auc = 0 best_epoch = 0 - for epoch in range(self.cfg.model_params.epochs): + for epoch in range(self.cfg.training_params.epochs): # train on each all data for shard_idx in range(len(self.itrain._data_shards)): data, labels = self.itrain.get_data_shards(shard_idx) @@ -145,7 +145,7 @@ def _fit_from_partial(self): if auc > best_auc: best_auc = auc best_epoch = epoch - if epoch - best_epoch > self.cfg.model_params.early_stopping_rounds: + if epoch - best_epoch > self.cfg.training_params.early_stopping_rounds: break def _train(self): diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 84a6609..668f80b 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -126,7 +126,7 @@ def _load_ids_and_labels( if load_labels: cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - if self.cfg.model_params.iterator.binarize_task: + if self.cfg.data_loading_params.binarize_task: cached_labels[shard] = cached_labels[shard].map_elements( lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 ) @@ -221,10 +221,10 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr def _set_imputer(self): """Sets the imputer for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "imputer") - and self.cfg.model_params.iterator.imputer.imputer_target + hasattr(self.cfg.data_loading_params, "imputer") + and self.cfg.data_loading_params.imputer.imputer_target ): - imputer = self.cfg.model_params.iterator.imputer.imputer_target + imputer = self.cfg.data_loading_params.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) @@ -240,10 +240,10 @@ def _set_imputer(self): def _set_scaler(self): """Sets the scaler for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "normalization") - and self.cfg.model_params.iterator.normalization.normalizer + hasattr(self.cfg.data_loading_params, "normalization") + and self.cfg.data_loading_params.normalization.normalizer ): - scaler = self.cfg.model_params.iterator.normalization.normalizer + scaler = self.cfg.data_loading_params.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 
17498eb..1355972 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -7,7 +7,7 @@ import numpy as np import polars as pl from loguru import logger -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig, ListConfig, OmegaConf from scipy.sparse import coo_array WRITE_USE_PYARROW = True @@ -45,7 +45,7 @@ def filter_to_codes( min_code_inclusion_count: int | None, min_code_inclusion_frequency: float | None, max_include_codes: int | None, -) -> list[str]: +) -> ListConfig[str]: """Filters and returns codes based on allowed list and minimum frequency. Args: @@ -63,8 +63,14 @@ def filter_to_codes( ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) ... filter_to_codes( f.name, ["A", "D"], 3, None, None) ['D'] + >>> with NamedTemporaryFile() as f: + ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) + ... filter_to_codes( f.name, ["A", "D"], 10, None, None) + Traceback (most recent call last): + ... + ValueError: Code filtering criteria ... + ... """ - feature_freqs = pl.read_parquet(code_metadata_fp) if allowed_codes is not None: @@ -72,18 +78,23 @@ def filter_to_codes( if min_code_inclusion_frequency is not None: pass - # need to consider size of the dataset vs count - - # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: - # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes) - return sorted(feature_freqs["code"].to_list()) + if len(feature_freqs["code"]) == 0: + raise ValueError( + f"Code filtering criteria leaves only 0 codes. Note that {feature_freqs.shape[0]} " + "codes are read in, try modifying the following kwargs:" + f"\n- tabularization.allowed_codes: {allowed_codes}" + f"\n- tabularization.min_code_inclusion_count: {min_code_inclusion_count}" + f"\n- tabularization.min_code_inclusion_frequency: {min_code_inclusion_frequency}" + f"\n- tabularization.max_include_codes: {max_include_codes}" + ) + return ListConfig(sorted(feature_freqs["code"].to_list())) OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True) @@ -434,7 +445,7 @@ def current_script_name() -> str: return Path(sys.argv[0]).stem -def tabularize_init(cfg: DictConfig): +def stage_init(cfg: DictConfig, keys: list[str]): """Initializes the stage by logging the configuration and the stage-specific paths. 
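The stricter `filter_to_codes` behaviour added above can also be exercised outside the doctest; a small sketch mirroring it (the temporary file stands in for the real code-metadata parquet):

```python
# Sketch mirroring the new filter_to_codes doctest: an over-aggressive count
# threshold now raises a ValueError instead of returning an empty code list.
from tempfile import NamedTemporaryFile

import polars as pl

from MEDS_tabular_automl.utils import filter_to_codes

with NamedTemporaryFile() as f:
    pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name)
    print(filter_to_codes(f.name, ["A", "D"], 3, None, None))  # ['D']
    try:
        filter_to_codes(f.name, ["A", "D"], 10, None, None)
    except ValueError as err:
        print(f"rejected: {err}")
```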
Args: @@ -449,59 +460,12 @@ def tabularize_init(cfg: DictConfig): f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" ) - input_dir = Path(cfg.data_input_dir) - output_dir = Path(cfg.stage_cfg.output_dir) - metadata_input_dir = Path(cfg.stage_cfg.metadata_input_dir) + chk_kwargs = {k: cfg[k] for k in keys} def chk(x: Path): return "✅" if x.exists() else "❌" - paths_strs = [ - f" - {k}: {chk(v)} {str(v.resolve())}" - for k, v in { - "input_dir": input_dir, - "output_dir": output_dir, - "metadata_input_dir": metadata_input_dir, - }.items() - ] - - logger_strs = [ - f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}", - "Paths: (checkbox indicates if it exists)", - ] - logger.debug("\n".join(logger_strs + paths_strs)) - - -def launch_model_init(cfg: DictConfig): - """Initializes the stage by logging the configuration and the stage-specific paths. - - Args: - cfg: The global configuration object, which should have a ``cfg.stage_cfg`` attribute containing the - stage specific configuration. - - Returns: The data input directory, stage output directory, and metadata input directory. - """ - hydra_loguru_init() - - logger.info( - f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" - ) - - input_dir = Path(cfg.data_input_dir) - output_dir = Path(cfg.stage_cfg.output_dir) - metadata_input_dir = Path(cfg.stage_cfg.metadata_input_dir) - - def chk(x: Path): - return "✅" if x.exists() else "❌" - - paths_strs = [ - f" - {k}: {chk(v)} {str(v.resolve())}" - for k, v in { - "input_dir": input_dir, - "output_dir": output_dir, - "metadata_input_dir": metadata_input_dir, - }.items() - ] + paths_strs = [f" - {k}: {chk(v)} {str(v.resolve())}" for k, v in chk_kwargs.items()] logger_strs = [ f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}", diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index fe8d96c..ce14ade 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -107,7 +107,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -132,10 +132,10 @@ def _train(self): """Trains the model.""" self.model = self.cfg.model self.model = self.model.train( - OmegaConf.to_container(self.cfg.model_params.model), + OmegaConf.to_container(self.cfg.model), self.dtrain, - num_boost_round=self.cfg.model_params.num_boost_round, - early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, + num_boost_round=self.cfg.training_params.num_boost_round, + early_stopping_rounds=self.cfg.training_params.early_stopping_rounds, # nthreads=self.cfg.nthreads, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], verbose_eval=0, diff --git a/tests/test_configs.py b/tests/test_configs.py index 708d270..a580597 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -5,17 +5,14 @@ import subprocess import hydra +import polars as pl import pytest from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig -from loguru import logger +from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.sklearn_model import SklearnModel from MEDS_tabular_automl.xgboost_model import XGBoostModel -logger.disable("MEDS_tabular_automl") -from omegaconf import OmegaConf - def run_command(script: str, args: 
list[str], hydra_kwargs: dict[str, str], test_name: str): command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] @@ -28,46 +25,70 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def make_config_mutable(cfg): - OmegaConf.set_readonly(cfg, False) - for key in cfg: - if isinstance(cfg[key], OmegaConf): - make_config_mutable(cfg[key]) + if OmegaConf.is_config(cfg): + OmegaConf.set_readonly(cfg, False) + for key in cfg.keys(): + print(key) + # try: + cfg[key] = make_config_mutable(cfg[key]) + # except: + # import pdb; pdb.set_trace() + return cfg + # elif isinstance(cfg, list): + # return [make_config_mutable(item) for item in cfg] + # elif isinstance(cfg, dict): + # return {key: make_config_mutable(value) for key, value in cfg.items()} + else: + return cfg @pytest.mark.parametrize( - "model", - ["xgboost", "sgd_classifier", "knn_classifier", "logistic_regression", "random_forest_classifier"], + "model_launcher_override", + [ + "xgboost", + "sgd_classifier", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "autogluon", + ], ) @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) @pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) -def test_model_config(model, imputer, normalization): - MEDS_cohort_dir = "blah" - xgboost_config_kwargs = { +def test_model_config(model_launcher_override, imputer, normalization, tmp_path): + MEDS_cohort_dir = "/foo/" + code_metadata_fp = f"/{str(tmp_path)}/codes.parquet" + model_launcher_config_kwargs = { "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "tabularization._resolved_codes": "[test,test2]", + "tabularized_dir": "/bar/", + "output_model_dir": "/baz/", + "++tabularization.filtered_code_metadata_fp": code_metadata_fp, + "++tabularization.min_code_inclusion_count": "0", + "task_name": "foo_bar", + "input_label_dir": "/qux/", } + pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(code_metadata_fp) - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [ - f"{k}={v}" for k, v in xgboost_config_kwargs.items() - ] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [ + f"model_launcher={model_launcher_override}", + f"data_processing_params.imputer={imputer}", + f"data_processing_params.normalization={normalization}", + ] + [f"{k}={v}" for k, v in model_launcher_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) - HydraConfig().set_config(cfg) - # make_config_mutable(cfg) - expected_model_class = XGBoostModel if model == "xgboost" else SklearnModel - model = hydra.utils.instantiate(cfg.model_target) - assert isinstance(model, expected_model_class) - # assert cfg.tabularization.window_sizes + model_launcher = hydra.utils.instantiate(cfg.model_launcher) + match model_launcher_override: + case "xgboost": + assert isinstance( + model_launcher, XGBoostModel + ), "model_launcher should be an 
instance of XGBoostModel" + case "autogluon": + assert isinstance( + model_launcher, DictConfig + ), "model_launcher should not be a DictConfig for autogluon" + case _: + assert isinstance( + model_launcher, SklearnModel + ), "model_launcher should be an instance of SklearnModel" + assert cfg.tabularization.window_sizes From 57a4a8195e5df503907bd74c85beb2954f658d9c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 00:56:00 +0000 Subject: [PATCH 32/54] updated configs, fixed most tests --- README.md | 16 +- docs/source/overview.md | 12 +- docs/source/prediction.md | 4 +- src/MEDS_tabular_automl/configs/default.yaml | 8 +- .../configs/describe_codes.yaml | 4 +- .../configs/launch_model.yaml | 14 +- .../configs/model_launcher/default.yaml | 1 + .../configs/tabularization.yaml | 6 +- .../configs/tabularization/default.yaml | 2 +- .../configs/task_specific_caching.yaml | 13 +- src/MEDS_tabular_automl/scripts/cache_task.py | 16 +- .../scripts/describe_codes.py | 2 +- .../scripts/launch_model.py | 21 +- .../scripts/tabularize_static.py | 2 +- src/MEDS_tabular_automl/tabular_dataset.py | 8 +- src/MEDS_tabular_automl/utils.py | 18 +- src/MEDS_tabular_automl/xgboost_model.py | 2 +- tests/test_configs.py | 28 ++- tests/test_integration.py | 14 +- tests/test_tabularize.py | 200 ++++++++---------- 20 files changed, 202 insertions(+), 189 deletions(-) diff --git a/README.md b/README.md index bcf0e5f..58d3414 100644 --- a/README.md +++ b/README.md @@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `meds_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static meds_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -127,19 +127,19 @@ By following these steps, you can seamlessly transform your dataset, define nece meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. 
**`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `meds_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task meds_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ @@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ diff --git a/docs/source/overview.md b/docs/source/overview.md index 44f68bf..0f91818 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -38,14 +38,14 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `meds_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static meds_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -62,19 +62,19 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `meds_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task meds_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -86,7 +86,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ diff --git a/docs/source/prediction.md b/docs/source/prediction.md index cfd126c..719fb5b 100644 --- a/docs/source/prediction.md +++ b/docs/source/prediction.md @@ -14,7 +14,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -84,7 +84,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + meds_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \ diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 82a2164..9ee58e9 100644 --- a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -1,13 +1,13 @@ -MEDS_cohort_dir: ??? -output_cohort_dir: ??? +meds_dir: ??? +tabularized_dir: ??? 
do_overwrite: False seed: 1 tqdm: False worker: 0 loguru_init: False -log_dir: ${output_cohort_dir}/.logs/ -cache_dir: ${output_cohort_dir}/.cache +log_dir: ${tabularized_dir}/.logs/ +cache_dir: ${tabularized_dir}/.cache hydra: verbose: False diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index 8d0aac3..fe5d07b 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -2,8 +2,8 @@ defaults: - default - _self_ -input_dir: ${MEDS_cohort_dir}/data +input_dir: ${meds_dir} # Where to store output code frequency data -output_filepath: ${output_cohort_dir}/metadata/codes.parquet +output_filepath: ${tabularized_dir}/metadata/codes.parquet name: describe_codes diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 5975e72..bba2fea 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -9,19 +9,13 @@ defaults: task_name: ??? -# Directory of meds data -MEDS_cohort_dir: ??? -# directory of tabularized data -tabularized_dir: ??? -# Task cached data dir -input_dir: ${tabularized_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ??? +# Location of task, split, and shard specific tabularized data +input_tabularized_cache_dir: ${tabularized_dir}/${task_name}/task_cache +# Location of task, split, and shard specific label data +input_label_cache_dir: ${tabularized_dir}/${task_name}/labels # Where to output the model and cached data output_model_dir: ??? -output_cohort_dir: ${tabularized_dir} - delete_below_top_k: -1 name: launch_model diff --git a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml index f9e5964..7b75e6e 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml @@ -7,6 +7,7 @@ defaults: - _self_ model_launcher: + path: ${path} data_processing_params: ${data_processing_params} data_loading_params: ${data_loading_params} tabularization: ${tabularization} diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index ca2c4cb..f356d4f 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -5,8 +5,8 @@ defaults: # Raw data # Where the code metadata is stored -input_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet -input_dir: ${MEDS_cohort_dir}/data -output_dir: ${output_cohort_dir}/tabularize +input_code_metadata_fp: ${tabularized_dir}/metadata/codes.parquet +input_dir: ${meds_dir} +output_dir: ${tabularized_dir}/tabularize name: tabularization diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index ada7dc9..820fe48 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,5 +1,5 @@ # User inputs -filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet +filtered_code_metadata_fp: ${tabularized_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml 
b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 80510f6..54ad9d3 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -4,13 +4,14 @@ defaults: - _self_ task_name: ??? -# Tabularized Data -input_dir: ${output_cohort_dir}/tabularize +# Directory of tabularized data +input_tabularized_dir: ${tabularized_dir}/tabularize # Where the labels are stored, with columns subject_id, timestamp, label -input_label_dir: ${MEDS_cohort_dir}/tasks/${task_name}/ -# Where to output the task specific tabularized data -output_dir: ${output_cohort_dir}/${task_name}/task_cache -output_label_dir: ${output_cohort_dir}/${task_name}/labels +input_label_dir: ??? +# Where to output the task, split, and shard specific tabularized data +output_tabularized_cache_dir: ${tabularized_dir}/${task_name}/task_cache +# Where to output the task, split, and shard specific label data +output_label_cache_dir: ${tabularized_dir}/${task_name}/labels label_column: "boolean_value" diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 36194cb..07fa0be 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -96,11 +96,19 @@ def main(cfg: DictConfig): # Produce ts representation # shuffle tasks - tabularization_tasks = list_subdir_files(cfg.input_dir, "npz") + tabularization_tasks = list_subdir_files(cfg.input_tabularized_dir, "npz") + if len(tabularization_tasks) == 0: + raise FileNotFoundError( + f"No tabularized data found, `tabularized_dir`: {cfg.input_tabularized_dir}, is likely incorrect" + ) np.random.shuffle(tabularization_tasks) label_dir = Path(cfg.input_label_dir) + if not label_dir.exists(): + raise FileNotFoundError( + f"Label directory {label_dir} does not exist, please check the `input_label_dir` kwarg" + ) label_df = ( pl.scan_parquet(label_dir / "**/*.parquet") .rename( @@ -119,9 +127,11 @@ def main(cfg: DictConfig): for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] - meds_data_in_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" + meds_data_in_fp = Path(cfg.meds_dir) / split / f"{shard_num}.parquet" shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" - out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") + out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_tabularized_dir, data_fp)).with_suffix( + ".npz" + ) def read_meds_data_df(meds_data_fp): if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index ad57406..0e46ac3 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -32,7 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. 
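The shard bookkeeping in `cache_task` above leans on the directory layout of the tabularized output: the last five path components of each `.npz` shard encode the split, shard number, window size, code type, and aggregation. A quick illustration with a made-up path that follows the layout the unpacking assumes:

```python
# Illustration of the path parsing used in cache_task; the example path is
# made up but follows the .../{split}/{shard}/{window}/{code_type}/{agg}.npz
# layout that the unpacking below assumes.
from pathlib import Path

data_fp = Path("/output/tabularize/train/0/365d/value/sum.npz")
split, shard_num, window_size, code_type, agg_name = data_fp.with_suffix("").parts[-5:]
print(split, shard_num, window_size, code_type, agg_name)  # train 0 365d value sum
```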
""" - stage_init(cfg, ["input_dir", "output_filepath"]) + stage_init(cfg, ["input_dir", "meds_dir"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index a8a8d28..1dc0792 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -31,23 +31,22 @@ def main(cfg: DictConfig) -> float: if not cfg.loguru_init: hydra_loguru_init() - model: BaseModel = hydra.utils.instantiate(cfg.model_target) + model_launcher: BaseModel = hydra.utils.instantiate(cfg.model_launcher) - model.train() - auc = model.evaluate() + model_launcher.train() + auc = model_launcher.evaluate() # save model - output_fp = Path(cfg.model_saving.model_dir) - output_fp = ( - output_fp.parent - / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_target.model_file_extension}" - ) - output_fp.parent.mkdir(parents=True, exist_ok=True) + output_model_dir = Path(cfg.output_model_dir) + path_cfg = model_launcher.cfg.path + model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}" + output_fp = output_model_dir / model_filename + output_model_dir.parent.mkdir(parents=True, exist_ok=True) # log to logfile - log_to_logfile(model, cfg, output_fp.stem) + log_to_logfile(model_launcher, cfg, output_fp.stem) - model.save_model(output_fp) + model_launcher.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 82b7870..1a616bf 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -57,7 +57,7 @@ def main( Args: cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + meds_dir: directory of MEDS format dataset that is ingested. tabularized_data_dir: output directory of tabularized data. min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate what features can be included in the flat representation. It can either be a float, in which diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 668f80b..db73041 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -47,13 +47,17 @@ def __init__(self, cfg: DictConfig, split: str = "train"): split: The data split to use, which can be one of "train", "tuning", or "held_out". This determines which subset of the data is loaded and processed. 
""" - super().__init__(cache_prefix=Path(cfg.cache_dir)) + super().__init__(cache_prefix=Path(cfg.path.cache_dir)) self.cfg = cfg self.split = split # Load shards for this split self._data_shards = sorted( - [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] + [shard.stem for shard in list_subdir_files(Path(cfg.path.input_label_dir) / split, "parquet")] ) + if len(self._data_shards) == 0: + raise ValueError( + f"No labels found in the `input_label_dir` {str(Path(cfg.path.input_label_dir).resolve())}" + ) self.valid_event_ids, self.labels = None, None self.codes_set, self.code_masks, self.num_features = self._get_code_set() diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 1355972..1296483 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -454,21 +454,25 @@ def stage_init(cfg: DictConfig, keys: list[str]): Returns: The data input directory, stage output directory, and metadata input directory. """ - hydra_loguru_init() - logger.info( f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" ) - chk_kwargs = {k: cfg[k] for k in keys} + chk_kwargs = {k: OmegaConf.select(cfg, k) for k in keys} - def chk(x: Path): - return "✅" if x.exists() else "❌" + def chk(x: Path | None) -> str: + if x is None: + return "❌" + return "✅" if x.exists() and str(x) != "" else "❌" - paths_strs = [f" - {k}: {chk(v)} {str(v.resolve())}" for k, v in chk_kwargs.items()] + paths_strs = [ + f" - {k}: {chk(Path(v) if v is not None else None)} " + f"{str(Path(v).resolve()) if v is not None else 'None'}" + for k, v in chk_kwargs.items() + ] logger_strs = [ - f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}", + f"Stage config:\n{OmegaConf.to_yaml(cfg)}", "Paths: (checkbox indicates if it exists)", ] logger.debug("\n".join(logger_strs + paths_strs)) diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index ce14ade..5ff6fd2 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -43,7 +43,7 @@ def __init__(self, cfg: DictConfig, split: str): cfg: The configuration dictionary. split: The data split to use. 
""" - xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) + xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir)) TabularDataset.__init__(self, cfg=cfg, split=split) self.valid_event_ids, self.labels = self._load_ids_and_labels() # check if the labels are empty diff --git a/tests/test_configs.py b/tests/test_configs.py index a580597..b16f34f 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -56,10 +56,10 @@ def make_config_mutable(cfg): @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) @pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) def test_model_config(model_launcher_override, imputer, normalization, tmp_path): - MEDS_cohort_dir = "/foo/" + meds_dir = "/foo/" code_metadata_fp = f"/{str(tmp_path)}/codes.parquet" model_launcher_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, + "meds_dir": meds_dir, "tabularized_dir": "/bar/", "output_model_dir": "/baz/", "++tabularization.filtered_code_metadata_fp": code_metadata_fp, @@ -92,3 +92,27 @@ def test_model_config(model_launcher_override, imputer, normalization, tmp_path) model_launcher, SklearnModel ), "model_launcher should be an instance of SklearnModel" assert cfg.tabularization.window_sizes + + +def test_generate_subsets_configs(): + meds_dir = "blah" + stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") + stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") + xgboost_config_kwargs = { + "meds_dir": meds_dir, + "tabularized_dir": "blah", + "do_overwrite": False, + "seed": 1, + "hydra.verbose": True, + "tqdm": False, + "loguru_init": True, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": f"{stdout_ws.strip()}", + } + + with initialize( + version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + ): # path to config.yaml + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + assert cfg.tabularization.window_sizes diff --git a/tests/test_integration.py b/tests/test_integration.py index 907b038..800a14e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -43,12 +43,12 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def test_integration(tmp_path): # Step 0: Setup Environment - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + meds_dir = Path(tmp_path) / "meds_dir" + tabularized_dir = Path(tmp_path) / "tabularized_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), + "meds_dir": str(meds_dir.resolve()), + "tabularized_dir": str(tabularized_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -65,12 +65,12 @@ def test_integration(tmp_path): cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (tabularized_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" + file_path = tabularized_dir / f"{split}.parquet" file_path.parent.mkdir(exist_ok=True) df = 
pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) @@ -86,7 +86,7 @@ def test_integration(tmp_path): for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = tabularized_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Run the describe_codes script diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index b1bd879..481d601 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -2,10 +2,7 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) -import importlib.util import json -import os -import subprocess from io import StringIO from pathlib import Path @@ -150,12 +147,14 @@ def test_tabularize(tmp_path): - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + meds_dir = Path(tmp_path) / "meds_dir" + tabularized_dir = Path(tmp_path) / "tabularized_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), + "meds_dir": str(meds_dir.resolve()), + "tabularized_dir": str(tabularized_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -172,16 +171,17 @@ def test_tabularize(tmp_path): cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (tabularized_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = meds_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) @@ -193,7 +193,7 @@ def test_tabularize(tmp_path): for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = meds_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Describe Codes - compute code frequencies describe_codes.main(cfg) @@ -221,7 +221,7 @@ def test_tabularize(tmp_path): cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml tabularize_static.main(cfg) - output_dir = Path(cfg.output_cohort_dir) / "tabularize" + output_dir = Path(cfg.tabularized_dir) / "tabularize" output_files = list(output_dir.glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] @@ -302,6 +302,9 @@ def test_tabularize(tmp_path): **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "tabularized_dir": str(tabularized_dir.resolve()), + "input_label_dir": str(input_label_dir.resolve()), } with initialize( @@ -335,121 +338,94 @@ def test_tabularize(tmp_path): [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs - xgboost_config_kwargs = { + xgboost_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "tabularized_dir": str(tabularized_dir.resolve()), + "input_label_dir": str(input_label_dir.resolve()), + "output_model_dir": str(output_model_dir.resolve()), } with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in xgboost_config.items()] cfg = compose( config_name="launch_model", overrides=overrides, return_hydra_config=True ) # config.yaml - output_dir = Path(cfg.output_cohort_dir) / "model" + output_dir = Path(cfg.tabularized_dir) / "model" HydraConfig().set_config(cfg) launch_model.main(cfg) output_files = list(output_dir.glob("**/*.json")) assert len(output_files) == 2 - sklearnmodel_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - } - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model" - - launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) - assert len(output_files) == 1 - - sklearnmodel_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - } - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model_online" - - launch_model.main(cfg) - output_files = 
list(output_dir.glob("**/*.pkl")) - assert len(output_files) == 1 - - if importlib.util.find_spec("autogluon") is not None: - import autogluon as ag - - from MEDS_tabular_automl.scripts import launch_autogluon - - autogluon_config_kwargs = { - **shared_config, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - } - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] - cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model_online" - - launch_autogluon.main(cfg) - output_files = list(output_dir.glob("*")) - most_recent_file = max(output_files, key=os.path.getmtime) - ag.tabular.TabularPredictor.load(most_recent_file) - - -def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): - command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] - command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) - stderr = command_out.stderr.decode() - stdout = command_out.stdout.decode() - if command_out.returncode != 0: - raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") - return stderr, stdout - - -def test_xgboost_config(): - MEDS_cohort_dir = "blah" - stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") - stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") - xgboost_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": f"{stdout_ws.strip()}", - } - - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - assert cfg.tabularization.window_sizes + # sklearnmodel_config = { + # **shared_config, + # "tabularization.min_code_inclusion_count": 1, + # "tabularization.window_sizes": "[30d,365d,full]", + # "task_name": "test_task", + # } + + # with initialize( + # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + # ): # path to config.yaml + # overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + # cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + + # output_dir = Path(cfg.tabularized_dir) / "model" + + # launch_model.main(cfg) + # output_files = list(output_dir.glob("**/*.pkl")) + # assert len(output_files) == 1 + + # sklearnmodel_config = { + # **shared_config, + # "tabularization.min_code_inclusion_count": 1, + # "tabularization.window_sizes": "[30d,365d,full]", + # "model_params.iterator.keep_data_in_memory": False, + # "model_saving.model_dir": "${tabularized_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + # "task_name": "test_task", + # } + + # with initialize( + # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + # ): # path to config.yaml + 
# overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + # cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + + # output_dir = Path(cfg.tabularized_dir) / "model_online" + + # launch_model.main(cfg) + # output_files = list(output_dir.glob("**/*.pkl")) + # assert len(output_files) == 1 + + # if importlib.util.find_spec("autogluon") is not None: + # import autogluon as ag + + # from MEDS_tabular_automl.scripts import launch_autogluon + + # autogluon_config = { + # **shared_config, + # "tabularization.min_code_inclusion_count": 1, + # "tabularization.window_sizes": "[30d,365d,full]", + # "model_params.iterator.keep_data_in_memory": False, + # "model_saving.model_dir": "${tabularized_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + # "task_name": "test_task", + # } + + # with initialize( + # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" + # ): # path to config.yaml + # overrides = [f"{k}={v}" for k, v in autogluon_config.items()] + # cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml + + # output_dir = Path(cfg.tabularized_dir) / "model_online" + + # launch_autogluon.main(cfg) + # output_files = list(output_dir.glob("*")) + # most_recent_file = max(output_files, key=os.path.getmtime) + # ag.tabular.TabularPredictor.load(most_recent_file) From 2f564e6c087736046825291eb00ca4242555405a Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 21:14:52 -0400 Subject: [PATCH 33/54] Removed unused pass block. --- src/MEDS_tabular_automl/utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 3d6f496..bd272ef 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -77,16 +77,12 @@ def filter_to_codes( feature_freqs = feature_freqs.filter(pl.col("code").is_in(allowed_codes)) if min_code_inclusion_frequency is not None: - pass - # need to consider size of the dataset vs count - - # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) + raise NotImplementedError("min_code_inclusion_frequency is not implemented yet") if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: - # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes) return sorted(feature_freqs["code"].to_list()) From 6f68a4b922b5fd9a1de78bf54457e2e607d202d4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 21:16:51 -0400 Subject: [PATCH 34/54] Removing unnecessary keys call --- tests/test_tabularize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index d110121..32ab69b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -281,7 +281,7 @@ def test_tabularize(tmp_path): f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" 
) output_files = list_subdir_files(str(output_dir.resolve()), "npz") - for split in split_json.keys(): + for split in split_json: for window in cfg.tabularization.window_sizes: for agg in cfg.tabularization.aggs: if agg.startswith("static"): From 6c2ba9a73cf26336f9d842b320f25cdcd397d2a0 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 8 Sep 2024 21:23:38 -0400 Subject: [PATCH 35/54] Fixed workflow files --- .github/workflows/code-quality-main.yaml | 6 ++++-- .github/workflows/code-quality-pr.yaml | 6 ++++-- .github/workflows/publish-to-pypi.yml | 26 +----------------------- .github/workflows/tests.yaml | 8 ++++---- 4 files changed, 13 insertions(+), 33 deletions(-) diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml index 3703b1f..bb2d601 100644 --- a/.github/workflows/code-quality-main.yaml +++ b/.github/workflows/code-quality-main.yaml @@ -13,10 +13,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 + with: + python-version: "3.11" - name: Run pre-commits uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml index a97d2c0..46c9eec 100644 --- a/.github/workflows/code-quality-pr.yaml +++ b/.github/workflows/code-quality-pr.yaml @@ -16,10 +16,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 + with: + python-version: "3.11" - name: Find modified files id: file_changes diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index d86806f..34eddad 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -12,7 +12,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: "3.11" - name: Install pypa/build run: >- python3 -m @@ -91,27 +91,3 @@ jobs: gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}' - - publish-to-testpypi: - name: Publish Python 🐍 distribution 📦 to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/ - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v3 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c96be0e..268269e 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -17,12 +17,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - - name: Set up Python 3.12 - uses: actions/setup-python@v3 + - name: Set up Python + uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.11" - name: Install packages run: | From e678145b7066c230b0d6a0bfde222109e893a288 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 03:28:53 +0000 Subject: [PATCH 36/54] fixed tabularize tests --- README.md | 20 +- docs/source/overview.md | 12 +- docs/source/prediction.md | 4 +- src/MEDS_tabular_automl/configs/default.yaml | 8 +- .../configs/describe_codes.yaml | 
3 +- .../configs/launch_model.yaml | 4 +- .../configs/model_launcher/path/default.yaml | 7 +- .../configs/tabularization.yaml | 6 +- .../configs/tabularization/default.yaml | 2 +- .../configs/task_specific_caching.yaml | 6 +- src/MEDS_tabular_automl/file_name.py | 4 +- src/MEDS_tabular_automl/scripts/cache_task.py | 16 +- .../scripts/describe_codes.py | 2 +- .../scripts/launch_autogluon.py | 31 ++- .../scripts/launch_model.py | 2 +- .../scripts/tabularize_static.py | 12 +- .../scripts/tabularize_time_series.py | 2 +- src/MEDS_tabular_automl/sklearn_model.py | 2 +- src/MEDS_tabular_automl/tabular_dataset.py | 10 +- src/MEDS_tabular_automl/utils.py | 6 +- src/MEDS_tabular_automl/xgboost_model.py | 4 +- tests/test_configs.py | 18 +- tests/test_integration.py | 32 ++- tests/test_tabularize.py | 204 +++++++++--------- 24 files changed, 210 insertions(+), 207 deletions(-) diff --git a/README.md b/README.md index 58d3414..ac704f8 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,12 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console # Re-shard pipeline - # $MIMICIV_MEDS_DIR is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data + # $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data # $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model # will store cached files during processing by default. # $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use. MEDS_transform-reshard_to_split \ - input_dir="$MIMICIV_MEDS_DIR" \ + input_dir="$MIMICIV_input_dir" \ cohort_dir="$MEDS_TAB_COHORT_DIR" \ 'stages=["reshard_to_split"]' \ stage="reshard_to_split" \ @@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `meds_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
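As a rough illustration (not part of the original patch series), the cached code-frequency file produced by the describe-codes step can be inspected directly with polars before picking an inclusion threshold. The `metadata/codes.parquet` location and the `code`/`count` columns are taken from the `describe_codes` config and `filter_to_codes` hunks elsewhere in this series; treat the exact path and schema here as assumptions.

```python
# Sketch only: preview which codes would survive a given inclusion threshold,
# mirroring the `filter_to_codes` logic touched later in this patch series.
import polars as pl

# Hypothetical location; the configs place this at `${output_dir}/metadata/codes.parquet`.
feature_freqs = pl.read_parquet("path_to_data/metadata/codes.parquet")

min_code_inclusion_count = 10  # same knob as `tabularization.min_code_inclusion_count`
kept_codes = sorted(
    feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)["code"].to_list()
)
print(f"{len(kept_codes)} codes would be retained")
```

The console example below applies the same kind of threshold through the `tabularization.min_code_inclusion_frequency` override.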
**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static meds_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -127,19 +127,19 @@ By following these steps, you can seamlessly transform your dataset, define nece meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `meds_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task meds_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console meds-tab-xgboost --multirun \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ @@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ diff --git a/docs/source/overview.md b/docs/source/overview.md index 0f91818..1d453f0 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -38,14 +38,14 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `meds_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. 
**`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static meds_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -62,19 +62,19 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `meds_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task meds_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -86,7 +86,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au ```console meds-tab-xgboost --multirun \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ diff --git a/docs/source/prediction.md b/docs/source/prediction.md index 719fb5b..18f19c0 100644 --- a/docs/source/prediction.md +++ b/docs/source/prediction.md @@ -14,7 +14,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -84,7 +84,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - meds_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \ diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 9ee58e9..7d4e392 100644 --- a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -1,13 +1,13 @@ -meds_dir: ??? -tabularized_dir: ??? +input_dir: ??? +output_dir: ??? 
do_overwrite: False seed: 1 tqdm: False worker: 0 loguru_init: False -log_dir: ${tabularized_dir}/.logs/ -cache_dir: ${tabularized_dir}/.cache +log_dir: ${output_dir}/.logs/ +cache_dir: ${output_dir}/.cache hydra: verbose: False diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index fe5d07b..007307c 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -2,8 +2,7 @@ defaults: - default - _self_ -input_dir: ${meds_dir} # Where to store output code frequency data -output_filepath: ${tabularized_dir}/metadata/codes.parquet +output_filepath: ${output_dir}/metadata/codes.parquet name: describe_codes diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index bba2fea..7008acf 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -10,9 +10,9 @@ defaults: task_name: ??? # Location of task, split, and shard specific tabularized data -input_tabularized_cache_dir: ${tabularized_dir}/${task_name}/task_cache +input_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache # Location of task, split, and shard specific label data -input_label_cache_dir: ${tabularized_dir}/${task_name}/labels +input_label_cache_dir: ${output_dir}/${task_name}/labels # Where to output the model and cached data output_model_dir: ??? diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml index 0c6d4d1..e3ef9bb 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -1,6 +1,7 @@ -input_dir: ${input_dir} -input_label_dir: ${input_label_dir} -output_dir: ${output_model_dir} +# input_dir: ${input_dir} +input_tabularized_cache_dir: ${input_tabularized_cache_dir} +input_label_cache_dir: ${input_label_cache_dir} +output_model_dir: ${output_model_dir} model_file_stem: model model_file_extension: .json log_dir: ${log_dir} diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index f356d4f..5d74eb9 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -5,8 +5,8 @@ defaults: # Raw data # Where the code metadata is stored -input_code_metadata_fp: ${tabularized_dir}/metadata/codes.parquet -input_dir: ${meds_dir} -output_dir: ${tabularized_dir}/tabularize +input_code_metadata_fp: ${output_dir}/metadata/codes.parquet +input_dir: ${input_dir} +output_tabularized_dir: ${output_dir}/tabularize name: tabularization diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index 820fe48..8c51383 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,5 +1,5 @@ # User inputs -filtered_code_metadata_fp: ${tabularized_dir}/metadata/codes.parquet +filtered_code_metadata_fp: ${output_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 54ad9d3..a372134 100644 --- 
a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -5,13 +5,13 @@ defaults: task_name: ??? # Directory of tabularized data -input_tabularized_dir: ${tabularized_dir}/tabularize +input_tabularized_dir: ${output_dir}/tabularize # Where the labels are stored, with columns subject_id, timestamp, label input_label_dir: ??? # Where to output the task, split, and shard specific tabularized data -output_tabularized_cache_dir: ${tabularized_dir}/${task_name}/task_cache +output_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache # Where to output the task, split, and shard specific label data -output_label_cache_dir: ${tabularized_dir}/${task_name}/labels +output_label_cache_dir: ${output_dir}/${task_name}/labels label_column: "boolean_value" diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 4ca144a..36b2861 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -73,7 +73,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: Examples: >>> cfg = DictConfig({ - ... "input_dir": "data", + ... "path": DictConfig({"input_tabularized_cache_dir" : "data"}), ... "tabularization": { ... "window_sizes": ["1d", "7d"], ... "aggs": ["code/count", "value/sum", "static/present"], @@ -94,7 +94,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: """ window_sizes = cfg.tabularization.window_sizes aggs = cfg.tabularization.aggs - shard_dir = Path(cfg.input_dir) / split / shard + shard_dir = Path(cfg.path.input_tabularized_cache_dir) / split / shard # Given a shard number, returns the model files model_files = [] for window_size in window_sizes: diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 07fa0be..dc0dd89 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -86,7 +86,8 @@ def main(cfg: DictConfig): "input_dir", "input_label_dir", "output_dir", - "output_label_dir", + "output_tabularized_cache_dir", + "output_label_cache_dir", "tabularization.filtered_code_metadata_fp", ], ) @@ -99,7 +100,8 @@ def main(cfg: DictConfig): tabularization_tasks = list_subdir_files(cfg.input_tabularized_dir, "npz") if len(tabularization_tasks) == 0: raise FileNotFoundError( - f"No tabularized data found, `tabularized_dir`: {cfg.input_tabularized_dir}, is likely incorrect" + f"No tabularized data found, `input_tabularized_dir`: {cfg.input_tabularized_dir}, " + "is likely incorrect" ) np.random.shuffle(tabularization_tasks) @@ -127,11 +129,11 @@ def main(cfg: DictConfig): for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] - meds_data_in_fp = Path(cfg.meds_dir) / split / f"{shard_num}.parquet" - shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" - out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_tabularized_dir, data_fp)).with_suffix( - ".npz" - ) + meds_data_in_fp = Path(cfg.input_dir) / split / f"{shard_num}.parquet" + shard_label_fp = Path(cfg.output_label_cache_dir) / split / f"{shard_num}.parquet" + out_fp = ( + Path(cfg.output_tabularized_cache_dir) / get_shard_prefix(cfg.input_tabularized_dir, data_fp) + ).with_suffix(".npz") def read_meds_data_df(meds_data_fp): if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: diff 
--git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index 0e46ac3..604b589 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -32,7 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. """ - stage_init(cfg, ["input_dir", "meds_dir"]) + stage_init(cfg, ["input_dir", "input_dir"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index 0cda6da..ccbc93d 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -1,10 +1,11 @@ +import json from importlib.resources import files from pathlib import Path import hydra import pandas as pd from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf try: import autogluon.tabular as ag @@ -15,7 +16,7 @@ from ..utils import hydra_loguru_init, stage_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") @@ -36,7 +37,7 @@ def main(cfg: DictConfig) -> float: """ check_autogluon() stage_init( - cfg, ["input_dir", "input_label_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] ) if not cfg.loguru_init: hydra_loguru_init() @@ -66,8 +67,13 @@ def main(cfg: DictConfig) -> float: held_out_dataset = ag.TabularDataset(held_out_df) # train model with AutoGluon + log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt" + predictor = ag.TabularPredictor( - label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath + label=cfg.task_name, + log_to_file=True, + log_file_path=str(log_filepath.resolve()), + path=cfg.output_model_dir, ).fit(train_data=train_dataset, tuning_data=tuning_dataset) # predict @@ -77,12 +83,17 @@ def main(cfg: DictConfig) -> float: score = predictor.evaluate(held_out_dataset) logger.info("Test score:", score) - log_fp = Path(cfg.model_log_dir) - log_fp.mkdir(parents=True, exist_ok=True) - # log hyperparameters - out_fp = log_fp / "trial_performance_results.log" - with open(out_fp, "w") as f: - f.write(f"{cfg.output_filepath}\t{cfg.tabularization}\t{cfg.model_params}\t{None}\t{score}\n") + model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json" + model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True) + # store results + performance_dict = { + "output_model_dir": cfg.path.output_model_dir, + "tabularization": OmegaConf.to_container(cfg.tabularization), + "model_launcher": OmegaConf.to_container(cfg.model_launcher), + "score": score, + } + with open(model_performance_log_filepath, "w") as f: + json.dump(performance_dict, f) if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 1dc0792..9f8b8da 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -25,7 +25,7 @@ def 
main(cfg: DictConfig) -> float: The evaluation result as the ROC AUC score on the held-out test set. """ stage_init( - cfg, ["input_dir", "input_label_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] ) if not cfg.loguru_init: diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 1a616bf..0ea91a4 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -57,7 +57,7 @@ def main( Args: cfg: - meds_dir: directory of MEDS format dataset that is ingested. + input_dir: directory of MEDS format dataset that is ingested. tabularized_data_dir: output directory of tabularized data. min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate what features can be included in the flat representation. It can either be a float, in which @@ -83,7 +83,13 @@ def main( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ stage_init( - cfg, ["input_code_metadata_fp", "input_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "output_tabularized_dir", + "tabularization.filtered_code_metadata_fp", + ], ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: @@ -134,7 +140,7 @@ def write_fn(data, out_fp): np.random.shuffle(tabularization_tasks) for shard_fp, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg ).with_suffix(".npz") if out_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {out_fp} exists!") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 9dd508e..e6d1337 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -87,7 +87,7 @@ def main( # iterate through them for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg ).with_suffix(".npz") def read_fn(in_fp): diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 8edd245..12ed980 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -225,7 +225,7 @@ def save_model(self, output_fp: str): if not hasattr(self.model, "save_model"): logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") logger.info("Model will be saved using pickle dump.") - if not output_fp.endswith(".pkl"): + if not str(output_fp.resolve()).endswith(".pkl"): raise ValueError("Model file extension must be .pkl.") with open(output_fp, "wb") as f: dump(self.model, f, protocol=5) diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index db73041..fa402b7 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -52,11 
+52,15 @@ def __init__(self, cfg: DictConfig, split: str = "train"): self.split = split # Load shards for this split self._data_shards = sorted( - [shard.stem for shard in list_subdir_files(Path(cfg.path.input_label_dir) / split, "parquet")] + [ + shard.stem + for shard in list_subdir_files(Path(cfg.path.input_label_cache_dir) / split, "parquet") + ] ) if len(self._data_shards) == 0: raise ValueError( - f"No labels found in the `input_label_dir` {str(Path(cfg.path.input_label_dir).resolve())}" + "No labels found in the `input_label_cache_dir` " + + str(Path(cfg.path.input_label_cache_dir).resolve()) ) self.valid_event_ids, self.labels = None, None @@ -118,7 +122,7 @@ def _load_ids_and_labels( to lists of corresponding labels. """ label_fps = { - shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") + shard: (Path(self.cfg.path.input_label_cache_dir) / self.split / shard).with_suffix(".parquet") for shard in self._data_shards for shard in self._data_shards } diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 1296483..290fdbf 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -410,18 +410,18 @@ def log_to_logfile(model, cfg, output_fp): cfg: The configuration dictionary. output_fp: The relative output file path. """ - log_fp = Path(cfg.model_logging.model_log_dir) + log_fp = Path(cfg.path.model_log_dir) # make a folder to log everything for this model out_fp = log_fp / output_fp out_fp.mkdir(parents=True, exist_ok=True) # config as a json - config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.json" + config_fp = out_fp / f"{cfg.path.config_log_stem}.json" with open(config_fp, "w") as f: f.write(OmegaConf.to_yaml(cfg)) - model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.csv" + model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.csv" with open(model_performance_fp, "w") as f: f.write("model_fp,tuning_auc,test_auc\n") f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 5ff6fd2..f95563f 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -130,13 +130,11 @@ def _build(self): def _train(self): """Trains the model.""" - self.model = self.cfg.model - self.model = self.model.train( + self.model = xgb.train( OmegaConf.to_container(self.cfg.model), self.dtrain, num_boost_round=self.cfg.training_params.num_boost_round, early_stopping_rounds=self.cfg.training_params.early_stopping_rounds, - # nthreads=self.cfg.nthreads, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], verbose_eval=0, ) diff --git a/tests/test_configs.py b/tests/test_configs.py index b16f34f..5ad1a13 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -56,11 +56,11 @@ def make_config_mutable(cfg): @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) @pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) def test_model_config(model_launcher_override, imputer, normalization, tmp_path): - meds_dir = "/foo/" + input_dir = "/foo/" code_metadata_fp = f"/{str(tmp_path)}/codes.parquet" model_launcher_config_kwargs = { - "meds_dir": meds_dir, - "tabularized_dir": "/bar/", + "input_dir": input_dir, + "output_dir": "/bar/", "output_model_dir": "/baz/", "++tabularization.filtered_code_metadata_fp": code_metadata_fp, 
"++tabularization.min_code_inclusion_count": "0", @@ -95,12 +95,12 @@ def test_model_config(model_launcher_override, imputer, normalization, tmp_path) def test_generate_subsets_configs(): - meds_dir = "blah" + input_dir = "blah" stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") xgboost_config_kwargs = { - "meds_dir": meds_dir, - "tabularized_dir": "blah", + "input_dir": input_dir, + "output_dir": "blah", "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -110,9 +110,7 @@ def test_generate_subsets_configs(): "tabularization.window_sizes": f"{stdout_ws.strip()}", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + cfg = compose(config_name="launch_model", overrides=overrides) assert cfg.tabularization.window_sizes diff --git a/tests/test_integration.py b/tests/test_integration.py index 800a14e..6c1e85b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -43,12 +43,12 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def test_integration(tmp_path): # Step 0: Setup Environment - meds_dir = Path(tmp_path) / "meds_dir" - tabularized_dir = Path(tmp_path) / "tabularized_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" shared_config = { - "meds_dir": str(meds_dir.resolve()), - "tabularized_dir": str(tabularized_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -58,19 +58,17 @@ def test_integration(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (tabularized_dir).mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = tabularized_dir / f"{split}.parquet" + file_path = output_dir / f"{split}.parquet" file_path.parent.mkdir(exist_ok=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) @@ -86,7 +84,7 @@ def test_integration(tmp_path): for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = tabularized_dir / ".shards.json" + splits_fp = output_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Run the describe_codes script @@ -117,11 +115,9 @@ def test_integration(tmp_path): tabularize_config, "tabularization", ) - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in tabularize_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + cfg = compose(config_name="tabularization", overrides=overrides) output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] @@ -200,11 +196,9 @@ def test_integration(tmp_path): "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 481d601..b84fb54 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -2,13 +2,14 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) +import importlib import json +import shutil from io import StringIO from pathlib import Path import polars as pl from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns @@ -147,14 +148,14 @@ def test_tabularize(tmp_path): - meds_dir = Path(tmp_path) / "meds_dir" - tabularized_dir = Path(tmp_path) / "tabularized_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" input_label_dir = Path(tmp_path) / "label_dir" output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "meds_dir": str(meds_dir.resolve()), - "tabularized_dir": str(tabularized_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -164,19 +165,17 @@ def test_tabularize(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (tabularized_dir).mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = meds_dir / f"{split}.parquet" + file_path = input_dir / 
f"{split}.parquet" file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) @@ -193,7 +192,7 @@ def test_tabularize(tmp_path): for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = meds_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Describe Codes - compute code frequencies describe_codes.main(cfg) @@ -214,14 +213,12 @@ def test_tabularize(tmp_path): "tabularization.window_sizes": "[30d,365d,full]", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + cfg = compose(config_name="tabularization", overrides=overrides) tabularize_static.main(cfg) - output_dir = Path(cfg.tabularized_dir) / "tabularize" + output_dir = Path(cfg.output_dir) / "tabularize" output_files = list(output_dir.glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] @@ -280,16 +277,16 @@ def test_tabularize(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" ) - output_files = list_subdir_files(str(output_dir.resolve()), "npz") + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") for split in split_json.keys(): for window in cfg.tabularization.window_sizes: for agg in cfg.tabularization.aggs: if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" assert expected_fp in output_files, f"Missing {expected_fp}" expected_num_time_tabs = ( NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) @@ -303,15 +300,12 @@ def test_tabularize(tmp_path): "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", "task_name": "test_task", - "tabularized_dir": str(tabularized_dir.resolve()), "input_label_dir": str(input_label_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() @@ -330,102 +324,98 @@ def test_tabularize(tmp_path): if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_cache_dir) / 
split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" - output_files = list_subdir_files(str(Path(cfg.output_dir).resolve()), "npz") + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") assert expected_fp in output_files, f"Missing {expected_fp}" [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] - assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) xgboost_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", "task_name": "test_task", - "tabularized_dir": str(tabularized_dir.resolve()), - "input_label_dir": str(input_label_dir.resolve()), "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in xgboost_config.items()] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) - output_dir = Path(cfg.tabularized_dir) / "model" - - HydraConfig().set_config(cfg) launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.json")) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.json")) assert len(output_files) == 2 + shutil.rmtree(expected_output_dir) + + sklearnmodel_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + + launch_model.main(cfg) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) + + sklearnmodel_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "data_loading_params.keep_data_in_memory": False, + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + assert not cfg.data_loading_params.keep_data_in_memory + assert cfg.model_launcher.data_loading_params.binarize_task + + output_dir = Path(cfg.output_dir) / "model_online" + + launch_model.main(cfg) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) + assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) + + if 
importlib.util.find_spec("autogluon") is not None: + import autogluon as ag + + from MEDS_tabular_automl.scripts import launch_autogluon + + autogluon_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=autogluon"] + [f"{k}={v}" for k, v in autogluon_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + + launch_autogluon.main(cfg) - # sklearnmodel_config = { - # **shared_config, - # "tabularization.min_code_inclusion_count": 1, - # "tabularization.window_sizes": "[30d,365d,full]", - # "task_name": "test_task", - # } - - # with initialize( - # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - # ): # path to config.yaml - # overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] - # cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - # output_dir = Path(cfg.tabularized_dir) / "model" - - # launch_model.main(cfg) - # output_files = list(output_dir.glob("**/*.pkl")) - # assert len(output_files) == 1 - - # sklearnmodel_config = { - # **shared_config, - # "tabularization.min_code_inclusion_count": 1, - # "tabularization.window_sizes": "[30d,365d,full]", - # "model_params.iterator.keep_data_in_memory": False, - # "model_saving.model_dir": "${tabularized_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - # "task_name": "test_task", - # } - - # with initialize( - # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - # ): # path to config.yaml - # overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] - # cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - # output_dir = Path(cfg.tabularized_dir) / "model_online" - - # launch_model.main(cfg) - # output_files = list(output_dir.glob("**/*.pkl")) - # assert len(output_files) == 1 - - # if importlib.util.find_spec("autogluon") is not None: - # import autogluon as ag - - # from MEDS_tabular_automl.scripts import launch_autogluon - - # autogluon_config = { - # **shared_config, - # "tabularization.min_code_inclusion_count": 1, - # "tabularization.window_sizes": "[30d,365d,full]", - # "model_params.iterator.keep_data_in_memory": False, - # "model_saving.model_dir": "${tabularized_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", - # "task_name": "test_task", - # } - - # with initialize( - # version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - # ): # path to config.yaml - # overrides = [f"{k}={v}" for k, v in autogluon_config.items()] - # cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml - - # output_dir = Path(cfg.tabularized_dir) / "model_online" - - # launch_autogluon.main(cfg) - # output_files = list(output_dir.glob("*")) - # most_recent_file = max(output_files, key=os.path.getmtime) - # ag.tabular.TabularPredictor.load(most_recent_file) + expected_output_filepath = Path(cfg.output_model_dir) / "predictor.pkl" + assert expected_output_filepath.is_file() + ag.tabular.TabularPredictor.load(cfg.output_model_dir) From d64e237e8493c348db10349e58f28fff4afd1024 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 04:53:26 +0000 Subject: [PATCH 37/54] added integration tests covering multirun for 
all launch_model models, and added an integration test for autogluon --- .../model_launcher/knn_classifier.yaml | 14 +-- .../model_launcher/logistic_regression.yaml | 13 +-- .../configs/model_launcher/path/default.yaml | 1 - .../random_forest_classifier.yaml | 22 ++-- .../model_launcher/sgd_classifier.yaml | 14 +-- .../configs/model_launcher/xgboost.yaml | 16 +-- src/MEDS_tabular_automl/scripts/cache_task.py | 3 +- .../scripts/describe_codes.py | 2 +- .../scripts/tabularize_static.py | 1 - .../scripts/tabularize_time_series.py | 7 +- tests/test_integration.py | 102 +++++++++++++++--- tests/test_tabularize.py | 3 +- 12 files changed, 131 insertions(+), 67 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml index c2c32f4..9f85e97 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -21,10 +21,10 @@ model_launcher: hydra: sweeper: params: - model.n_neighbors: range(1, 20) - model.weights: choice(['uniform', 'distance']) - model.leaf_size: range(10, 50) - model.p: choice([1, 2]) - model.metric: choice(['minkowski', 'euclidean', 'manhattan']) - epochs: range(10, 100) - early_stopping_rounds: range(1, 10) + +model_launcher.model.n_neighbors: range(1, 20) + model_launcher.model.weights: choice('uniform', 'distance') + model_launcher.model.leaf_size: range(10, 50) + model_launcher.model.p: choice(1, 2) + model_launcher.model.metric: choice('minkowski', 'euclidean', 'manhattan') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml index 1871e51..4531efc 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -21,18 +21,13 @@ model_launcher: solver: "lbfgs" max_iter: 100 - training_params: - epochs: 20 - early_stopping_rounds: 5 - path: model_file_extension: .pkl hydra: sweeper: params: - model.C: tag(log, interval(1e-6, 1)) - model.penalty: choice(['l1', 'l2', 'elasticnet']) - model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) - epochs: range(10, 100) - early_stopping_rounds: range(1, 10) + model_launcher.model.C: tag(log, interval(1e-6, 1)) + model_launcher.model.solver: choice('lbfgs', 'sag', 'saga') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml index e3ef9bb..d739ce3 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -1,4 +1,3 @@ -# input_dir: ${input_dir} input_tabularized_cache_dir: ${input_tabularized_cache_dir} input_label_cache_dir: ${input_label_cache_dir} output_model_dir: ${output_model_dir} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml index ccc5f2e..4a50beb 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml +++ 
b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -20,22 +20,18 @@ model_launcher: min_impurity_decrease: 0.0 bootstrap: True - training_params: - epochs: 20 - early_stopping_rounds: 5 - path: model_file_extension: .pkl hydra: sweeper: params: - model.n_estimators: range(50, 300, 50) - model.max_depth: choice([null, 10, 20, 30, 40, 50]) - model.min_samples_split: range(2, 11) - model.min_samples_leaf: range(1, 5) - model.max_features: choice(['sqrt', 'log2', null]) - model.bootstrap: choice([True, False]) - model.criterion: choice(['gini', 'entropy']) - epochs: range(10, 100) - early_stopping_rounds: range(1, 10) + +model_launcher.model.n_estimators: range(50, 300, 50) + model_launcher.model.max_depth: choice(10, 20, 30, 40, 50) + model_launcher.model.min_samples_split: range(2, 11) + model_launcher.model.min_samples_leaf: range(1, 5) + model_launcher.model.max_features: choice('sqrt', 'log2') + model_launcher.model.bootstrap: choice(True, False) + model_launcher.model.criterion: choice('gini', 'entropy') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml index c62b7e1..9f6cb1d 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -11,18 +11,14 @@ model_launcher: _target_: sklearn.linear_model.SGDClassifier loss: log_loss - training_params: - epochs: 20 - early_stopping_rounds: 5 - path: model_file_extension: .pkl hydra: sweeper: params: - model.alpha: tag(log, interval(1e-6, 1)) - model.l1_ratio: interval(0, 1) - model.penalty: choice(['l1', 'l2', 'elasticnet']) - epochs: range(10, 100) - early_stopping_rounds: range(1, 10) + +model_launcher.model.alpha: tag(log, interval(1e-6, 1)) + +model_launcher.model.l1_ratio: interval(0, 1) + +model_launcher.model.penalty: choice('l1', 'l2', 'elasticnet') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml index 71e4637..b7e9065 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -20,12 +20,12 @@ model_launcher: hydra: sweeper: params: - model.eta: tag(log, interval(0.001, 1)) - model.lambda: tag(log, interval(0.001, 1)) - model.alpha: tag(log, interval(0.001, 1)) - model.subsample: interval(0.5, 1) - model.min_child_weight: interval(1e-2, 100) - num_boost_round: range(100, 1000) - early_stopping_rounds: range(1, 10) - model.max_depth: range(2, 16) + +model_launcher.model.eta: tag(log, interval(0.001, 1)) + +model_launcher.model.lambda: tag(log, interval(0.001, 1)) + +model_launcher.model.alpha: tag(log, interval(0.001, 1)) + +model_launcher.model.subsample: interval(0.5, 1) + +model_launcher.model.min_child_weight: interval(1e-2, 100) + +model_launcher.model.max_depth: range(2, 16) + model_launcher.training_params.num_boost_round: range(100, 1000) + model_launcher.training_params.early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 
dc0dd89..a437119 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -85,9 +85,8 @@ def main(cfg: DictConfig): [ "input_dir", "input_label_dir", + "input_tabularized_dir", "output_dir", - "output_tabularized_cache_dir", - "output_label_cache_dir", "tabularization.filtered_code_metadata_fp", ], ) diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index 604b589..4391ccf 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -32,7 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. """ - stage_init(cfg, ["input_dir", "input_dir"]) + stage_init(cfg, ["input_dir"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 0ea91a4..2baf406 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -87,7 +87,6 @@ def main( [ "input_code_metadata_fp", "input_dir", - "output_tabularized_dir", "tabularization.filtered_code_metadata_fp", ], ) diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index e6d1337..98cca20 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -66,7 +66,12 @@ def main( ValueError: If required columns like 'code' or 'value' are missing in the data files. """ stage_init( - cfg, ["input_code_metadata_fp", "input_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "tabularization.filtered_code_metadata_fp", + ], ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: diff --git a/tests/test_integration.py b/tests/test_integration.py index 6c1e85b..d231be1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -13,6 +13,7 @@ CODE_COLS, EXPECTED_STATIC_FILES, MEDS_OUTPUTS, + NUM_SHARDS, SPLITS_JSON, STATIC_FIRST_COLS, STATIC_PRESENT_COLS, @@ -45,6 +46,8 @@ def test_integration(tmp_path): # Step 0: Setup Environment input_dir = Path(tmp_path) / "input_dir" output_dir = Path(tmp_path) / "output_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { "input_dir": str(input_dir.resolve()), @@ -68,23 +71,24 @@ def test_integration(tmp_path): # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_dir / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = input_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" 
for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Run the describe_codes script @@ -94,6 +98,7 @@ def test_integration(tmp_path): describe_codes_config, "describe_codes", ) + assert Path(cfg.output_filepath).is_file() feature_columns = get_feature_columns(cfg.output_filepath) @@ -104,7 +109,7 @@ def test_integration(tmp_path): assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) # Step 2: Run the static data tabularization script - tabularize_config = { + tabularize_static_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", @@ -112,15 +117,17 @@ def test_integration(tmp_path): stderr, stdout = run_command( "meds-tab-tabularize-static", [], - tabularize_config, + tabularize_static_config, "tabularization", ) with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): - overrides = [f"{k}={v}" for k, v in tabularize_config.items()] + overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] cfg = compose(config_name="tabularization", overrides=overrides) - output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) - actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + output_dir = Path(cfg.output_dir) / "tabularize" + + output_files = list(output_dir.glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] assert set(actual_files) == set(EXPECTED_STATIC_FILES) # Check the files are not empty for f in output_files: @@ -164,11 +171,9 @@ def test_integration(tmp_path): ) # confirm summary files exist: - output_files = list_subdir_files(cfg.output_dir, "npz") + output_files = list_subdir_files(str(output_dir.resolve()), "npz") actual_files = [ - get_shard_prefix(Path(cfg.output_dir), each) + ".npz" - for each in output_files - if "none/static" not in str(each) + get_shard_prefix(output_dir, each) + ".npz" for each in output_files if "none/static" not in str(each) ] assert len(actual_files) > 0 for f in output_files: @@ -190,16 +195,36 @@ def test_integration(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" 
) + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" + assert expected_fp in output_files, f"Missing {expected_fp}" + expected_num_time_tabs = ( + NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) + ) + expected_num_static_tabs = NUM_SHARDS * 2 + assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + # Step 4: Run the task_specific_caching script cache_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "input_label_dir": str(input_label_dir.resolve()), } with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] cfg = compose(config_name="task_specific_caching", overrides=overrides) + # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) @@ -223,3 +248,54 @@ def test_integration(tmp_path): cache_config, "task_specific_caching", ) + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") + assert expected_fp in output_files, f"Missing {expected_fp}" + [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) + + stderr, stdout = run_command( + "meds-tab-cache-task", + [ + "--multirun", + f"tabularization.aggs={stdout_agg.strip()}", + ], + cache_config, + "task_specific_caching", + ) + + for model in [ + "xgboost", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "sgd_classifier", + ]: + model_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + "model_launcher": model, + "hydra.sweeper.n_trials": 1, + } + overrides = [f"tabularization.aggs={stdout_agg.strip()}"] + if model == "autogluon": + script = "meds-tab-autogluon" + else: + script = "meds-tab-model" + overrides = ["--multirun"] + overrides + + stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index b84fb54..707b776 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -187,7 +187,7 @@ def test_tabularize(tmp_path): # Check the files are not empty meds_files = 
list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" @@ -293,7 +293,6 @@ def test_tabularize(tmp_path): ) expected_num_static_tabs = NUM_SHARDS * 2 assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs - cfg.output_dir # Step 3: Cache Task data cache_config = { **shared_config, From c631e9395564c98c853755cd123d231d78d4ba63 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 05:35:58 +0000 Subject: [PATCH 38/54] fixed tests --- tests/test_configs.py | 1 - tests/test_tabularize.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_configs.py b/tests/test_configs.py index 5ad1a13..d0bbc45 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -65,7 +65,6 @@ def test_model_config(model_launcher_override, imputer, normalization, tmp_path) "++tabularization.filtered_code_metadata_fp": code_metadata_fp, "++tabularization.min_code_inclusion_count": "0", "task_name": "foo_bar", - "input_label_dir": "/qux/", } pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(code_metadata_fp) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index d8a005f..006252d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -351,11 +351,11 @@ def test_tabularize(tmp_path): expected_output_dir = Path(cfg.output_model_dir) output_files = list(expected_output_dir.glob("**/*.json")) assert len(output_files) == 2 - shutil.rmtree(expected_output_dir) log_dir = Path(cfg.path.model_log_dir) - log_csv = list(log_dir.glob("**/*.log")) - assert len(log_csv) == 2 + log_files = list(log_dir.glob("**/*.log")) + assert len(log_files) == 1 + shutil.rmtree(expected_output_dir) sklearnmodel_config = { **shared_config, From a4ad03c0f42adc25b6f30d1ff6a812e6d6c1fef6 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 06:05:25 +0000 Subject: [PATCH 39/54] resolved review feedback. Added a based_model docstring. Added version for polars. 
Added github workflow test matrix over python versions, removed redundant run_command definition from test_configs --- .github/workflows/tests.yaml | 6 +++-- pyproject.toml | 2 +- src/MEDS_tabular_automl/base_model.py | 2 ++ tests/__init__.py | 0 tests/test_configs.py | 32 ++------------------------- tests/test_integration.py | 20 ++++++++--------- tests/test_tabularize.py | 3 --- 7 files changed, 19 insertions(+), 46 deletions(-) create mode 100644 tests/__init__.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c96be0e..0ac7cd5 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -12,6 +12,8 @@ jobs: strategy: fail-fast: false + matrix: + python-version: ["3.11", "3.12"] timeout-minutes: 30 @@ -19,10 +21,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python 3.12 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: - python-version: "3.12" + python-version: ${{ matrix.python-version }} - name: Install packages run: | diff --git a/pyproject.toml b/pyproject.toml index 1b75489..a30e14b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "polars", "pyarrow", "loguru", "hydra-core==1.3.2", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", + "polars==1.6.0", "pyarrow", "loguru", "hydra-core==1.3.2", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py index 35a9ccf..4663542 100644 --- a/src/MEDS_tabular_automl/base_model.py +++ b/src/MEDS_tabular_automl/base_model.py @@ -9,6 +9,8 @@ class BaseModel(ABC, TimeableMixin): + """Defines the interface for a model that can be trained and evaluated via the launch_model script.""" + @abstractmethod def __init__(self): pass diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_configs.py b/tests/test_configs.py index d0bbc45..5183214 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -2,44 +2,16 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) -import subprocess import hydra import polars as pl import pytest from hydra import compose, initialize -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig from MEDS_tabular_automl.sklearn_model import SklearnModel from MEDS_tabular_automl.xgboost_model import XGBoostModel - - -def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): - command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] - command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) - stderr = command_out.stderr.decode() - stdout = command_out.stdout.decode() - if command_out.returncode != 0: - raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") - return stderr, stdout - - -def make_config_mutable(cfg): - if OmegaConf.is_config(cfg): - OmegaConf.set_readonly(cfg, False) - for key in cfg.keys(): - print(key) - # try: - cfg[key] = make_config_mutable(cfg[key]) - # except: - # import pdb; pdb.set_trace() - return cfg - # elif isinstance(cfg, list): - # return [make_config_mutable(item) for item in cfg] - # elif isinstance(cfg, dict): - # return {key: make_config_mutable(value) 
for key, value in cfg.items()} - else: - return cfg +from tests.test_integration import run_command @pytest.mark.parametrize( diff --git a/tests/test_integration.py b/tests/test_integration.py index d231be1..d623914 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -9,16 +9,6 @@ import polars as pl from hydra import compose, initialize -from test_tabularize import ( - CODE_COLS, - EXPECTED_STATIC_FILES, - MEDS_OUTPUTS, - NUM_SHARDS, - SPLITS_JSON, - STATIC_FIRST_COLS, - STATIC_PRESENT_COLS, - VALUE_COLS, -) from MEDS_tabular_automl.describe_codes import get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files @@ -30,6 +20,16 @@ get_unique_time_events_df, load_matrix, ) +from tests.test_tabularize import ( + CODE_COLS, + EXPECTED_STATIC_FILES, + MEDS_OUTPUTS, + NUM_SHARDS, + SPLITS_JSON, + STATIC_FIRST_COLS, + STATIC_PRESENT_COLS, + VALUE_COLS, +) def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 006252d..7cf975a 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -10,7 +10,6 @@ import polars as pl from hydra import compose, initialize -from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns from MEDS_tabular_automl.file_name import list_subdir_files @@ -30,8 +29,6 @@ load_matrix, ) -logger.disable("MEDS_tabular_automl") - SPLITS_JSON = """{"train/0": [239684, 1195293], "train/1": [68729, 814703], "tuning/0": [754281], "held_out/0": [1500733]}""" # noqa: E501 NUM_SHARDS = 4 From 0db7bd69f3a25317489b8348267a8ae36fc7bbb3 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 06:37:00 +0000 Subject: [PATCH 40/54] fixed min_code_inclusion_frequency kwarg --- .../configs/launch_model.yaml | 2 ++ .../configs/model_launcher/xgboost.yaml | 2 +- .../scripts/tabularize_static.py | 2 +- src/MEDS_tabular_automl/utils.py | 22 ++++++++++++++++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 7008acf..f6dc949 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -26,3 +26,5 @@ hydra: subdir: "1" run: dir: ${path.model_log_dir} + sweeper: + direction: "maximize" diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml index b7e9065..6f364ae 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -28,4 +28,4 @@ hydra: +model_launcher.model.max_depth: range(2, 16) model_launcher.training_params.num_boost_round: range(100, 1000) model_launcher.training_params.early_stopping_rounds: range(1, 10) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) + tabularization.min_code_inclusion_count: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 2baf406..e2ef51f 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -59,7 +59,7 @@ def main( cfg: input_dir: directory of MEDS format dataset that is ingested. tabularized_data_dir: output directory of tabularized data. 
- min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate + min_code_inclusion_count: The base feature inclusion count that should be used to dictate what features can be included in the flat representation. It can either be a float, in which case it applies across all measurements, or `None`, in which case no filtering is applied, or a dictionary from measurement type to a float dictating a per-measurement-type inclusion diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 97a74f9..d5eac76 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -49,9 +49,14 @@ def filter_to_codes( """Filters and returns codes based on allowed list and minimum frequency. Args: - allowed_codes: List of allowed codes, None means all codes are allowed. - min_code_inclusion_count: Minimum frequency a code must have to be included. code_metadata_fp: Path to the metadata file containing code information. + allowed_codes: List of allowed codes, None means all codes are allowed. + min_code_inclusion_count: Minimum count a code must have to be included. + min_code_inclusion_frequency: The minimum frequency a code must have, + normalized by dividing its count by the total number of observations + across all codes in the dataset, to be included. + max_include_codes: Maximum number of codes to include (selecting the most + prevalent codes). Returns: Sorted list of the intersection of allowed codes (if they are specified) and filters based on @@ -65,6 +70,14 @@ def filter_to_codes( ['D'] >>> with NamedTemporaryFile() as f: ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) + ... filter_to_codes( f.name, None, None, .35, None) + ['E'] + >>> with NamedTemporaryFile() as f: + ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) + ... filter_to_codes( f.name, None, None, None, 1) + ['E'] + >>> with NamedTemporaryFile() as f: + ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) ... filter_to_codes( f.name, ["A", "D"], 10, None, None) Traceback (most recent call last): ... @@ -77,7 +90,10 @@ def filter_to_codes( feature_freqs = feature_freqs.filter(pl.col("code").is_in(allowed_codes)) if min_code_inclusion_frequency is not None: - pass + if min_code_inclusion_frequency < 0 or min_code_inclusion_frequency > 1: + raise ValueError("min_code_inclusion_frequency must be between 0 and 1.") + dataset_size = feature_freqs["count"].sum() + feature_freqs = feature_freqs.filter((pl.col("count") / dataset_size) >= min_code_inclusion_frequency) if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) From b289033b7ffc9173b851021477c317ae02dcd80a Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 07:29:21 +0000 Subject: [PATCH 41/54] added mimic iv tutorial --- MIMICIV_TUTORIAL/README.MD | 69 +++++++++++++++++++++++++++++ MIMICIV_TUTORIAL/tabularize_meds.sh | 63 ++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 MIMICIV_TUTORIAL/README.MD create mode 100644 MIMICIV_TUTORIAL/tabularize_meds.sh diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD new file mode 100644 index 0000000..e8ea9ac --- /dev/null +++ b/MIMICIV_TUTORIAL/README.MD @@ -0,0 +1,69 @@ +# MIMIC-IV Example + +This is an example of how to extract a MEDS dataset from MIMIC-IV. 
All scripts in this README are assumed to +be run **not** from this directory but from the root directory of this entire repository (e.g., one directory +up from this one). + +## Extract MIMIC-IV MEDS Data + +### Download pre-extracted data from gcp + +Install the [gcloud client](https://cloud.google.com/sdk/docs/install) and then run the following command to download the MEDS data from the gcp bucket: + +```console +export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the extracted MIMIC-IV MEDS data +export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data +export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory + +cd $MIMICIV_MEDS_DIR +gcloud storage cp gs://ehr_standardization_schema/MEDS_Extract_v0.0.7_test.zip meds_extract_0.0.7_data.zip +unzip meds_extract_0.0.7_data.zip +rm meds_extract_0.0.7_data.zip +``` + +```console +conda create -n meds_tab python=3.12 +conda activate meds_tab +pip install "meds-tab==0.0.5" +``` + +Next we need to get some labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples. + +### Download pre-extracted labels from gcp: + +```console +TASKS=("long_los" "icu_mortality") +TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks + +mkdir -p "${TASKS_DIR}" # create a directory for the task + +for TASK_NAME in "${TASKS[@]}" +do + gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}.parquet" +done +``` + +## Pre-Processing for Tabularization + +```console +export N_PARALLEL_WORKERS=48 # Set number of workers +export RESHARD_DIR=??? # set to directory to output resharded meds data +bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \ + ["long_los","icu_mortality"] $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` + +## Train XGBOOST Baseline + +```console +meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" output_model_dir=$OUTPUT_MODEL_DIR task_name=long_los + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` + +```console +meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" task_name=icu_mortality + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh new file mode 100644 index 0000000..221dfb1 --- /dev/null +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e + +MIMICIV_MEDS_DIR="$1" +MIMICIV_MEDS_RESHARD_DIR="$2" +OUTPUT_TABULARIZATION_DIR="$3" +TASKS="$4" +TASKS_DIR="$5" +OUTPUT_MODEL_DIR="$6" +N_PARALLEL_WORKERS="$7" + +shift 7 + + +IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" + +MEDS_transform-reshard_to_split \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$MIMICIV_MEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \ + 'stages=["reshard_to_split"]' \ + stage="reshard_to_split" \ + 
stage_configs.reshard_to_split.n_subjects_per_shard=2500 \ + "hydra.sweeper.polling_time=5" + +# describe codes +echo "Describing codes" +meds-tab-describe \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" + +echo "Tabularizing static data" +echo meds-tab-tabularize-static \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + do_overwrite=False "$@" + +meds-tab-tabularize-time-series \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + do_overwrite=False "$@" + + +for TASK in "${TASK_ARRAY[@]}" +do + echo "Running task_specific_caching.py" + meds-tab-cache-task \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" + + echo "Running xgboost" + meds-tab-xgboost \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" +done From 9294920cc1911617abb26f7f91392430ef55f3fd Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 07:38:54 +0000 Subject: [PATCH 42/54] updated tabularization script to fix bugs --- MIMICIV_TUTORIAL/tabularize_meds.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 221dfb1..5ba5807 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -24,23 +24,23 @@ MEDS_transform-reshard_to_split \ 'stages=["reshard_to_split"]' \ stage="reshard_to_split" \ stage_configs.reshard_to_split.n_subjects_per_shard=2500 \ - "hydra.sweeper.polling_time=5" + "polling_time=5" # describe codes echo "Describing codes" meds-tab-describe \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" echo "Tabularizing static data" echo meds-tab-tabularize-static \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" meds-tab-tabularize-time-series \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" @@ -51,13 +51,13 @@ do --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" echo "Running xgboost" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ - "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" done From d71f9dcf602800f779cf2a845a16ca128dd1bd68 Mon Sep 17 00:00:00 2001 From: 
Nassim Oufattole Date: Mon, 9 Sep 2024 07:42:44 +0000 Subject: [PATCH 43/54] reduced the number of workers for resharding --- MIMICIV_TUTORIAL/tabularize_meds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 5ba5807..55f5b4f 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -17,7 +17,7 @@ IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" MEDS_transform-reshard_to_split \ --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ + worker="range(0,6)" \ hydra/launcher=joblib \ input_dir="$MIMICIV_MEDS_DIR" \ cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \ From 0dc2bc697a6e91b470e932d3c49760a209fe99f6 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 13:42:12 +0000 Subject: [PATCH 44/54] updated tabularize meds to take string input for tasks --- MIMICIV_TUTORIAL/README.MD | 2 +- MIMICIV_TUTORIAL/tabularize_meds.sh | 51 ++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD index e8ea9ac..e340a09 100644 --- a/MIMICIV_TUTORIAL/README.MD +++ b/MIMICIV_TUTORIAL/README.MD @@ -49,7 +49,7 @@ done export N_PARALLEL_WORKERS=48 # Set number of workers export RESHARD_DIR=??? # set to directory to output resharded meds data bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \ - ["long_los","icu_mortality"] $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ + "long_los,icu_mortality" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" ``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 55f5b4f..9374c22 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -2,6 +2,36 @@ set -e +# Function to print help message +print_help() { + echo "Usage: $0 <MIMICIV_MEDS_DIR> <MIMICIV_MEDS_RESHARD_DIR> <OUTPUT_TABULARIZATION_DIR> <TASKS> <TASKS_DIR> <OUTPUT_MODEL_DIR> <N_PARALLEL_WORKERS> [additional arguments]" + echo + echo "Arguments:" + echo "  MIMICIV_MEDS_DIR            Directory containing MIMIC-IV MEDS data" + echo "  MIMICIV_MEDS_RESHARD_DIR    Directory for resharded MIMIC-IV MEDS data" + echo "  OUTPUT_TABULARIZATION_DIR   Output directory for tabularized data" + echo "  TASKS                       Comma-separated list of tasks to run (e.g., 'long_los,icu_mortality')" + echo "  TASKS_DIR                   Directory containing task-specific data" + echo "  OUTPUT_MODEL_DIR            Output directory for models" + echo "  N_PARALLEL_WORKERS          Number of parallel workers to use" + echo + echo "Additional arguments will be passed to the underlying commands." +} + +# Check for help flag +if [[ "$1" == "--help" || "$1" == "-h" ]]; then + print_help + exit 0 +fi + +# Check if we have the minimum required number of arguments +if [ "$#" -lt 7 ]; then + echo "Error: Not enough arguments provided." 
+ print_help + exit 1 +fi + +# Assign arguments to variables MIMICIV_MEDS_DIR="$1" MIMICIV_MEDS_RESHARD_DIR="$2" OUTPUT_TABULARIZATION_DIR="$3" @@ -12,9 +42,23 @@ N_PARALLEL_WORKERS="$7" shift 7 +# Split the TASKS string into an array +IFS=',' read -ra TASK_ARRAY <<< "$TASKS" -IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" +# Print input arguments +echo "Input arguments:" +echo "MIMICIV_MEDS_DIR: $MIMICIV_MEDS_DIR" +echo "MIMICIV_MEDS_RESHARD_DIR: $MIMICIV_MEDS_RESHARD_DIR" +echo "OUTPUT_TABULARIZATION_DIR: $OUTPUT_TABULARIZATION_DIR" +echo "TASKS:" "${TASK_ARRAY[@]}" +echo "TASKS_DIR: $TASKS_DIR" +echo "OUTPUT_MODEL_DIR: $OUTPUT_MODEL_DIR" +echo "N_PARALLEL_WORKERS: $N_PARALLEL_WORKERS" +echo "Additional arguments:" "$@" +echo +# Reshard the data +echo "Resharding data" MEDS_transform-reshard_to_split \ --multirun \ worker="range(0,6)" \ @@ -43,10 +87,9 @@ meds-tab-tabularize-time-series \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" - for TASK in "${TASK_ARRAY[@]}" do - echo "Running task_specific_caching.py" + echo "Running task_specific_caching.py for task: $TASK" meds-tab-cache-task \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ @@ -54,7 +97,7 @@ do "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" - echo "Running xgboost" + echo "Running xgboost for task: $TASK" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ From 2aa4feb259f599e7d376de8f4cc90090ae8074e9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 9 Sep 2024 10:11:21 -0400 Subject: [PATCH 45/54] Improved error handling per https://github.com/mmcdermott/MEDS_Tabular_AutoML/pull/81#discussion_r1748999196 --- src/MEDS_tabular_automl/evaluation_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index 4382ff2..18a86be 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -14,7 +14,7 @@ def on_multirun_end(self, config: DictConfig, **kwargs): try: performance = pl.read_csv(log_fp / f"*/*{config.model_logging.performance_log_stem}.log") except Exception as e: - raise FileNotFoundError(f"Log files incomplete or not found at {log_fp}, exception {e}.") + raise FileNotFoundError(f"Log files incomplete or not found at {log_fp}") from e performance = performance.sort("tuning_auc", descending=True, nulls_last=True) logger.info(f"\nPerformance of the top 10 models:\n{performance.head(10)}") From a6d91037422b881b566ea7d62a94ba6587fdcf8d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 9 Sep 2024 10:13:57 -0400 Subject: [PATCH 46/54] Update README.md Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ac704f8..d091bb3 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ By following these steps, you can seamlessly transform your dataset, define nece 4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. 
- **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) + **Example: Align tabularized data** for a specific task `$TASK` and labels that have been pulled from [ACES](https://github.com/justin13601/ACES) ```console meds-tab-cache-task input_dir="path_to_data" \ From 23eb4d4efb2dd208d9a52e1be49e7ba78fad9115 Mon Sep 17 00:00:00 2001 From: Nassim Date: Mon, 9 Sep 2024 23:18:30 -0400 Subject: [PATCH 47/54] added try except around loading 0 codes --- src/MEDS_tabular_automl/scripts/launch_model.py | 7 +++++++ tests/test_tabularize.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 9f8b8da..23c52f7 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -4,6 +4,7 @@ import hydra from omegaconf import DictConfig +from loguru import logger from MEDS_tabular_automl.base_model import BaseModel @@ -31,6 +32,12 @@ def main(cfg: DictConfig) -> float: if not cfg.loguru_init: hydra_loguru_init() + try: + cfg.tabularization._resolved_codes + except ValueError as e: + logger.warning(f"No codes meet loading critera, trial returning 0 AUC: {str(e)}") + return 0.0 + model_launcher: BaseModel = hydra.utils.instantiate(cfg.model_launcher) model_launcher.train() diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index c3bea0c..565ef5b 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -331,6 +331,20 @@ def test_tabularize(tmp_path): == expected_num_time_tabs + expected_num_static_tabs ) + failure_xgboost_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 100_000_000, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in failure_xgboost_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) + + assert 0.0 == launch_model.main(cfg) + xgboost_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, From be5f723cd895d31f4a42ba67dc6d610995aaed0c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 10 Sep 2024 03:21:26 +0000 Subject: [PATCH 48/54] fixed job name config bug where we were missing the $ so it was not resolved. 
Fixed bugs in e2e meds-tab mimic script --- MIMICIV_TUTORIAL/README.MD | 18 ++---------------- MIMICIV_TUTORIAL/tabularize_meds.sh | 10 +++++----- src/MEDS_tabular_automl/configs/default.yaml | 2 +- .../configs/launch_model.yaml | 2 +- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD index e340a09..5084824 100644 --- a/MIMICIV_TUTORIAL/README.MD +++ b/MIMICIV_TUTORIAL/README.MD @@ -39,11 +39,11 @@ mkdir -p "${TASKS_DIR}" # create a directory for the task for TASK_NAME in "${TASKS[@]}" do - gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}.parquet" + gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}/0.parquet" done ``` -## Pre-Processing for Tabularization +## Run Tabularization and XGBoost Baseline ```console export N_PARALLEL_WORKERS=48 # Set number of workers @@ -53,17 +53,3 @@ bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $O "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" ``` - -## Train XGBOOST Baseline - -```console -meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" output_model_dir=$OUTPUT_MODEL_DIR task_name=long_los - "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ - "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" -``` - -```console -meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" task_name=icu_mortality - "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ - "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" -``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 9374c22..d81a9d6 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -76,7 +76,7 @@ meds-tab-describe \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" echo "Tabularizing static data" -echo meds-tab-tabularize-static \ +meds-tab-tabularize-static \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" @@ -91,16 +91,16 @@ for TASK in "${TASK_ARRAY[@]}" do echo "Running task_specific_caching.py for task: $TASK" meds-tab-cache-task \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ - "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" + "input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" do_overwrite=False "$@" echo "Running xgboost for task: $TASK" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ - "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" + "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False \ + "hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \ + "$@" done diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 7d4e392..538bc18 100644 --- 
a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -12,7 +12,7 @@ cache_dir: ${output_dir}/.cache hydra: verbose: False job: - name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S} + name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: dir: ${log_dir} run: diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index f6dc949..3d886b1 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -22,7 +22,7 @@ name: launch_model hydra: sweep: - dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/ + dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/ subdir: "1" run: dir: ${path.model_log_dir} From d390658cd81356f9629817920f193cb939eab5af Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 10 Sep 2024 03:26:57 +0000 Subject: [PATCH 49/54] fixed precommit issues --- src/MEDS_tabular_automl/scripts/launch_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 23c52f7..68a4de1 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -3,8 +3,8 @@ from pathlib import Path import hydra -from omegaconf import DictConfig from loguru import logger +from omegaconf import DictConfig from MEDS_tabular_automl.base_model import BaseModel @@ -35,7 +35,7 @@ def main(cfg: DictConfig) -> float: try: cfg.tabularization._resolved_codes except ValueError as e: - logger.warning(f"No codes meet loading critera, trial returning 0 AUC: {str(e)}") + logger.warning(f"No codes meet loading criteria, trial returning 0 AUC: {str(e)}") return 0.0 model_launcher: BaseModel = hydra.utils.instantiate(cfg.model_launcher) From a5648869e6584b978fadbdc99eadcc0cbaa92846 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Tue, 10 Sep 2024 12:12:41 +0000 Subject: [PATCH 50/54] fix paths for eval_callback and add check to test_integration --- .../evaluation_callback.py | 19 ++++++++++--------- tests/test_integration.py | 3 ++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index 18a86be..9805cb9 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -9,20 +9,16 @@ class EvaluationCallback(Callback): def on_multirun_end(self, config: DictConfig, **kwargs): """Find best model based on log files and logger.info its performance and hyperparameters.""" - log_fp = Path(config.model_logging.model_log_dir) + log_fp = Path(config.path.model_log_dir) try: - performance = pl.read_csv(log_fp / f"*/*{config.model_logging.performance_log_stem}.log") + performance = pl.read_csv(log_fp / f"*/*{config.path.performance_log_stem}.log") except Exception as e: raise FileNotFoundError(f"Log files incomplete or not found at {log_fp}") from e performance = performance.sort("tuning_auc", descending=True, nulls_last=True) logger.info(f"\nPerformance of the top 10 models:\n{performance.head(10)}") - # get best model_fp - best_model = performance[0, 0] - - logger.info(f"The best model can be found at {best_model}") self.log_performance(performance[0, :]) if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( @@ -33,11 +29,16 @@ def 
on_multirun_end(self, config: DictConfig, **kwargs): def log_performance(self, best_model_performance): """logger.info performance of the best model with nice formatting.""" + best_model = best_model_performance["model_fp"][0] tuning_auc = best_model_performance["tuning_auc"][0] test_auc = best_model_performance["test_auc"][0] - logger.info( - f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}", - ) + log_performance_message = [ + f"\nBest model can be found at {best_model}", + "Performance of best model:", + f"Tuning AUC: {tuning_auc}", + f"Test AUC: {test_auc}", + ] + logger.info("\n".join(log_performance_message)) def delete_below_top_k_models(self, performance, k, model_dir): """Save only top k models from the model directory and delete all other files.""" diff --git a/tests/test_integration.py b/tests/test_integration.py index d623914..411be80 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -289,7 +289,7 @@ def test_integration(tmp_path): "task_name": "test_task", "output_model_dir": str(output_model_dir.resolve()), "model_launcher": model, - "hydra.sweeper.n_trials": 1, + "hydra.sweeper.n_trials": 2, } overrides = [f"tabularization.aggs={stdout_agg.strip()}"] if model == "autogluon": @@ -299,3 +299,4 @@ def test_integration(tmp_path): overrides = ["--multirun"] + overrides stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") + assert "Performance of best model:" in stderr From 430afba35d815dd5dfb82b9b3c08f4ab351356ca Mon Sep 17 00:00:00 2001 From: teyaberg Date: Tue, 10 Sep 2024 13:17:36 +0000 Subject: [PATCH 51/54] fixing tests for delete_below_top_k --- .../evaluation_callback.py | 42 ++++++++++++++++--- src/MEDS_tabular_automl/utils.py | 2 +- tests/test_integration.py | 6 +++ tests/test_tabularize.py | 4 +- 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index 9805cb9..f5cc10a 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -20,9 +20,13 @@ def on_multirun_end(self, config: DictConfig, **kwargs): logger.info(f"\nPerformance of the top 10 models:\n{performance.head(10)}") self.log_performance(performance[0, :]) - if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0: + if hasattr(config, "delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( - performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir + performance, config.delete_below_top_k, config.path.output_model_dir + ) + else: + logger.info( + "All models were saved. To automatically delete models, set delete_below_top_k in config." ) return performance.head(1) @@ -41,8 +45,36 @@ def log_performance(self, best_model_performance): logger.info("\n".join(log_performance_message)) def delete_below_top_k_models(self, performance, k, model_dir): - """Save only top k models from the model directory and delete all other files.""" - top_k_models = performance.head(k)["model_fp"].values + """Save only top k models from the model directory and delete all other files. + + Args: + performance: DataFrame containing model_fp and performance metrics. + k: Number of top models to save. + model_dir: Directory containing models. + + Example: + >>> import tempfile + >>> import json + >>> performance = pl.DataFrame( + ... { + ... "model_fp": ["model1", "model2", "model3", "model4"], + ... 
"tuning_auc": [0.9, 0.8, 0.7, 0.6], + ... "test_auc": [0.9, 0.8, 0.7, 0.6], + ... } + ... ) + >>> k = 2 + >>> with tempfile.TemporaryDirectory() as model_dir: + ... for model in performance["model_fp"]: + ... with open(Path(model_dir) / f"{model}.json", 'w') as f: + ... json.dump({"model_name": model, "content": "dummy data"}, f) + ... cb = EvaluationCallback() + ... cb.delete_below_top_k_models(performance, k, model_dir) + ... remaining_models = sorted(p.stem for p in Path(model_dir).iterdir()) + >>> remaining_models + ['model1', 'model2'] + """ + logger.info(f"Deleting all models except top {k} models.") + top_k_models = performance.head(k)["model_fp"].to_list() for model_fp in Path(model_dir).iterdir(): - if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp) not in top_k_models: + if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp.stem) not in top_k_models: model_fp.unlink() diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index d5eac76..9f9d9e7 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -433,7 +433,7 @@ def log_to_logfile(model, cfg, output_fp): out_fp.mkdir(parents=True, exist_ok=True) # config as a json - config_fp = out_fp / f"{cfg.path.config_log_stem}.json" + config_fp = out_fp / f"{cfg.path.config_log_stem}.log" with open(config_fp, "w") as f: f.write(OmegaConf.to_yaml(cfg)) diff --git a/tests/test_integration.py b/tests/test_integration.py index 411be80..d31bc5b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -289,7 +289,9 @@ def test_integration(tmp_path): "task_name": "test_task", "output_model_dir": str(output_model_dir.resolve()), "model_launcher": model, + "path.model_file_stem": model, "hydra.sweeper.n_trials": 2, + "delete_below_top_k": 1, } overrides = [f"tabularization.aggs={stdout_agg.strip()}"] if model == "autogluon": @@ -300,3 +302,7 @@ def test_integration(tmp_path): stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") assert "Performance of best model:" in stderr + if model == "xgboost": + assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1 + else: + assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1 diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 565ef5b..cb5d4c4 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -361,11 +361,11 @@ def test_tabularize(tmp_path): expected_output_dir = Path(cfg.output_model_dir) output_files = list(expected_output_dir.glob("**/*.json")) - assert len(output_files) == 2 + assert len(output_files) == 1 log_dir = Path(cfg.path.model_log_dir) log_files = list(log_dir.glob("**/*.log")) - assert len(log_files) == 1 + assert len(log_files) == 2 shutil.rmtree(expected_output_dir) sklearnmodel_config = { From 9e6d99acfebe4ce1a450969866f9d1ff99d47af3 Mon Sep 17 00:00:00 2001 From: teyaberg Date: Tue, 10 Sep 2024 15:21:04 +0000 Subject: [PATCH 52/54] fix out of memory xgboost training and added test --- src/MEDS_tabular_automl/xgboost_model.py | 2 +- tests/test_integration.py | 34 +++++++++++++++++++ tests/test_tabularize.py | 43 ++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index 8326ace..a3233c3 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -43,7 +43,7 @@ def __init__(self, cfg: DictConfig, split: str): 
cfg: The configuration dictionary. split: The data split to use. """ - xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir)) + xgb.DataIter.__init__(self, cache_prefix=cfg.path.cache_dir) TabularDataset.__init__(self, cfg=cfg, split=split) self._it = 0 diff --git a/tests/test_integration.py b/tests/test_integration.py index d31bc5b..df0e833 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,6 +3,7 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import json +import shutil import subprocess from io import StringIO from pathlib import Path @@ -292,6 +293,7 @@ def test_integration(tmp_path): "path.model_file_stem": model, "hydra.sweeper.n_trials": 2, "delete_below_top_k": 1, + "data_loading_params.keep_data_in_memory": True, } overrides = [f"tabularization.aggs={stdout_agg.strip()}"] if model == "autogluon": @@ -306,3 +308,35 @@ def test_integration(tmp_path): assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1 else: assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1 + shutil.rmtree(output_model_dir) + + for model in [ + "xgboost", + "sgd_classifier", + ]: + model_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + "model_launcher": model, + "path.model_file_stem": model, + "hydra.sweeper.n_trials": 2, + "delete_below_top_k": 1, + "data_loading_params.keep_data_in_memory": False, + } + overrides = [f"tabularization.aggs={stdout_agg.strip()}"] + if model == "autogluon": + script = "meds-tab-autogluon" + else: + script = "meds-tab-model" + overrides = ["--multirun"] + overrides + + stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") + assert "Performance of best model:" in stderr + if model == "xgboost": + assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1 + else: + assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1 + shutil.rmtree(output_model_dir) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index cb5d4c4..6bb4e6d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -9,6 +9,7 @@ from pathlib import Path import polars as pl +import pytest from hydra import compose, initialize from MEDS_tabular_automl.describe_codes import get_feature_columns @@ -368,6 +369,31 @@ def test_tabularize(tmp_path): assert len(log_files) == 2 shutil.rmtree(expected_output_dir) + xgboost_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "path.model_file_stem": "xgboost_oom_test", + "output_model_dir": str(output_model_dir.resolve()), + "data_loading_params.keep_data_in_memory": False, + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in xgboost_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) + + launch_model.main(cfg) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.json")) + assert len(output_files) == 1 + + log_dir = Path(cfg.path.model_log_dir) + log_files = list(log_dir.glob("**/*.log")) + assert len(log_files) == 2 + shutil.rmtree(expected_output_dir) + sklearnmodel_config = { 
**shared_config, "tabularization.min_code_inclusion_count": 1, @@ -387,6 +413,23 @@ def test_tabularize(tmp_path): assert len(output_files) == 1 shutil.rmtree(expected_output_dir) + sklearnmodel_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + "data_loading_params.keep_data_in_memory": False, + } + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=knn_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + with pytest.raises( + ValueError, match="Data is loaded in shards, but KNeighborsClassifier does not support partial_fit." + ): + launch_model.main(cfg) + sklearnmodel_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, From 83163656aceaae9798fc0e04b4e90cda70215f50 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 10 Sep 2024 17:35:10 +0000 Subject: [PATCH 53/54] simplified pathing for results and evaluation callback --- .../configs/launch_model.yaml | 6 ++- .../configs/model_launcher/autogluon.yaml | 3 ++ .../model_launcher/knn_classifier.yaml | 5 ++- .../model_launcher/logistic_regression.yaml | 5 ++- .../configs/model_launcher/path/default.yaml | 5 +-- .../random_forest_classifier.yaml | 5 ++- .../model_launcher/sgd_classifier.yaml | 5 ++- .../configs/model_launcher/xgboost.yaml | 3 ++ .../evaluation_callback.py | 28 ++++++++------ .../scripts/launch_autogluon.py | 10 +++-- .../scripts/launch_model.py | 38 +++++++++++++------ src/MEDS_tabular_automl/sklearn_model.py | 3 +- src/MEDS_tabular_automl/utils.py | 27 ------------- tests/test_integration.py | 21 ++++++---- tests/test_tabularize.py | 18 ++++----- 15 files changed, 96 insertions(+), 86 deletions(-) diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 3d886b1..f59f30c 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -16,15 +16,17 @@ input_label_cache_dir: ${output_dir}/${task_name}/labels # Where to output the model and cached data output_model_dir: ??? 
+time_output_model_dir: ${output_model_dir}/${now:%Y-%m-%d_%H-%M-%S} + delete_below_top_k: -1 name: launch_model hydra: sweep: - dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/ + dir: ${time_output_model_dir}/hydra/ subdir: "1" run: - dir: ${path.model_log_dir} + dir: ${path.sweep_results_dir} sweeper: direction: "maximize" diff --git a/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml index b7d02cd..d67d249 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml @@ -1,3 +1,6 @@ defaults: - default - _self_ + +path: + model_file_stem: "autogluon" diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml index 9f85e97..4d13963 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -15,8 +15,9 @@ model_launcher: p: 2 metric: "minkowski" - path: - model_file_extension: .pkl +path: + model_file_extension: .pkl + model_file_stem: "knn_classifier" hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml index 4531efc..4d679a7 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -21,8 +21,9 @@ model_launcher: solver: "lbfgs" max_iter: 100 - path: - model_file_extension: .pkl +path: + model_file_extension: .pkl + model_file_stem: "logistic_regression" hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml index d739ce3..42139b3 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -1,10 +1,9 @@ input_tabularized_cache_dir: ${input_tabularized_cache_dir} input_label_cache_dir: ${input_label_cache_dir} -output_model_dir: ${output_model_dir} model_file_stem: model model_file_extension: .json -log_dir: ${log_dir} cache_dir: ${cache_dir} -model_log_dir: ${output_model_dir}/.logs/ +sweep_results_dir: ${time_output_model_dir}/sweep_results/ +best_trial_dir: ${time_output_model_dir}/best_trial/ performance_log_stem: performance config_log_stem: config diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml index 4a50beb..b4f9452 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -20,8 +20,9 @@ model_launcher: min_impurity_decrease: 0.0 bootstrap: True - path: - model_file_extension: .pkl +path: + model_file_extension: .pkl + model_file_stem: "random_forest_classifier" hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml index 9f6cb1d..6c1ad00 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -11,8 +11,9 @@ model_launcher: _target_: sklearn.linear_model.SGDClassifier loss: 
log_loss - path: - model_file_extension: .pkl +path: + model_file_extension: .pkl + model_file_stem: "sgd_classifier" hydra: sweeper: diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml index 6f364ae..401611b 100644 --- a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -17,6 +17,9 @@ model_launcher: num_boost_round: 1000 early_stopping_rounds: 5 +path: + model_file_stem: "xgboost" + hydra: sweeper: params: diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index f5cc10a..ceb64e0 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -1,3 +1,4 @@ +import shutil from pathlib import Path import polars as pl @@ -9,7 +10,7 @@ class EvaluationCallback(Callback): def on_multirun_end(self, config: DictConfig, **kwargs): """Find best model based on log files and logger.info its performance and hyperparameters.""" - log_fp = Path(config.path.model_log_dir) + log_fp = Path(config.path.sweep_results_dir) try: performance = pl.read_csv(log_fp / f"*/*{config.path.performance_log_stem}.log") @@ -22,18 +23,22 @@ def on_multirun_end(self, config: DictConfig, **kwargs): self.log_performance(performance[0, :]) if hasattr(config, "delete_below_top_k") and config.delete_below_top_k >= 0: self.delete_below_top_k_models( - performance, config.delete_below_top_k, config.path.output_model_dir + performance, config.delete_below_top_k, config.path.sweep_results_dir ) else: logger.info( "All models were saved. To automatically delete models, set delete_below_top_k in config." ) + best_trial_dir = Path(config.path.sweep_results_dir) / performance["trial_name"].cast(pl.String)[0] + output_best_trial_dir = Path(config.path.best_trial_dir) + shutil.copytree(best_trial_dir, output_best_trial_dir) + performance.write_parquet(config.time_output_model_dir / "sweep_results_summary.parquet") return performance.head(1) def log_performance(self, best_model_performance): """logger.info performance of the best model with nice formatting.""" - best_model = best_model_performance["model_fp"][0] + best_model = best_model_performance["trial_name"][0] tuning_auc = best_model_performance["tuning_auc"][0] test_auc = best_model_performance["test_auc"][0] log_performance_message = [ @@ -44,11 +49,11 @@ def log_performance(self, best_model_performance): ] logger.info("\n".join(log_performance_message)) - def delete_below_top_k_models(self, performance, k, model_dir): + def delete_below_top_k_models(self, performance, k, sweep_results_dir): """Save only top k models from the model directory and delete all other files. Args: - performance: DataFrame containing model_fp and performance metrics. + performance: DataFrame containing trial_name and performance metrics. k: Number of top models to save. model_dir: Directory containing models. @@ -57,14 +62,14 @@ def delete_below_top_k_models(self, performance, k, model_dir): >>> import json >>> performance = pl.DataFrame( ... { - ... "model_fp": ["model1", "model2", "model3", "model4"], + ... "trial_name": ["model1", "model2", "model3", "model4"], ... "tuning_auc": [0.9, 0.8, 0.7, 0.6], ... "test_auc": [0.9, 0.8, 0.7, 0.6], ... } ... ) >>> k = 2 >>> with tempfile.TemporaryDirectory() as model_dir: - ... for model in performance["model_fp"]: + ... for model in performance["trial_name"]: ... 
with open(Path(model_dir) / f"{model}.json", 'w') as f: ... json.dump({"model_name": model, "content": "dummy data"}, f) ... cb = EvaluationCallback() @@ -74,7 +79,8 @@ def delete_below_top_k_models(self, performance, k, model_dir): ['model1', 'model2'] """ logger.info(f"Deleting all models except top {k} models.") - top_k_models = performance.head(k)["model_fp"].to_list() - for model_fp in Path(model_dir).iterdir(): - if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp.stem) not in top_k_models: - model_fp.unlink() + top_k_models = performance.head(k)["trial_name"].cast(pl.String).to_list() + logger.debug(f"Top {k} models: {top_k_models}") + for trial_dir in Path(sweep_results_dir).iterdir(): + if trial_dir.stem not in top_k_models: + shutil.rmtree(trial_dir) diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index 6f645c2..1f61902 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -67,13 +67,13 @@ def main(cfg: DictConfig) -> float: held_out_dataset = ag.TabularDataset(held_out_df) # train model with AutoGluon - log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt" + log_filepath = Path(cfg.path.sweep_results_dir) / f"{cfg.path.config_log_stem}_log.txt" predictor = ag.TabularPredictor( label=cfg.task_name, log_to_file=True, log_file_path=str(log_filepath.resolve()), - path=cfg.output_model_dir, + path=cfg.time_output_model_dir, ).fit(train_data=train_dataset, tuning_data=tuning_dataset) # predict @@ -83,11 +83,13 @@ def main(cfg: DictConfig) -> float: score = predictor.evaluate(held_out_dataset) logger.info("Test score:", score) - model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json" + model_performance_log_filepath = ( + Path(cfg.path.sweep_results_dir) / f"{cfg.path.performance_log_stem}.json" + ) model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True) # store results performance_dict = { - "output_model_dir": cfg.path.output_model_dir, + "output_model_dir": cfg.path.time_output_model_dir, "tabularization": OmegaConf.to_container(cfg.tabularization), "model_launcher": OmegaConf.to_container(cfg.model_launcher), "score": score, diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 68a4de1..c1ca295 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -1,14 +1,14 @@ -import time +import json from importlib.resources import files from pathlib import Path import hydra from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.base_model import BaseModel -from ..utils import hydra_loguru_init, log_to_logfile, stage_init +from ..utils import hydra_loguru_init, stage_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): @@ -43,17 +43,31 @@ def main(cfg: DictConfig) -> float: model_launcher.train() auc = model_launcher.evaluate() - # save model - output_model_dir = Path(cfg.output_model_dir) + # Make output model directory path_cfg = model_launcher.cfg.path - model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}" - output_fp = output_model_dir / model_filename - output_model_dir.parent.mkdir(parents=True, exist_ok=True) - - # log to logfile 
- log_to_logfile(model_launcher, cfg, output_fp.stem) + model_filename = f"{path_cfg.model_file_stem}{path_cfg.model_file_extension}" + model_config_hash = abs(hash(json.dumps(OmegaConf.to_container(cfg), sort_keys=True))) + trial_output_dir = Path(path_cfg.sweep_results_dir) / str(model_config_hash) + trial_output_dir.mkdir(parents=True, exist_ok=True) - model_launcher.save_model(output_fp) + # save model + model_launcher.save_model(trial_output_dir / model_filename) + + # save model config + config_fp = trial_output_dir / f"{cfg.path.config_log_stem}.log" + with open(config_fp, "w") as f: + f.write(OmegaConf.to_yaml(cfg)) + + # save model performance + model_performance_fp = trial_output_dir / f"{cfg.path.performance_log_stem}.log" + with open(model_performance_fp, "w") as f: + f.write("trial_name,tuning_auc,test_auc\n") + f.write( + f"{trial_output_dir.stem},{model_launcher.evaluate()}," + f"{model_launcher.evaluate(split='held_out')}\n" + ) + + logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") return auc diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 795ae36..e5ef225 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -175,13 +175,12 @@ def evaluate(self, split: str = "tuning") -> float: raise ValueError("Predictions or true labels are empty.") return roc_auc_score(y_true, y_pred) - def save_model(self, output_fp: str): + def save_model(self, output_fp: Path): """Saves the model to the specified file path. Args: output_fp: The file path to save the model to. """ - output_fp = Path(output_fp) # check if model has save method if not hasattr(self.model, "save_model"): logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 9f9d9e7..e275f9f 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -418,33 +418,6 @@ def get_shard_prefix(base_path: Path, fp: Path) -> str: return str(relative_parent / file_name) -def log_to_logfile(model, cfg, output_fp): - """Log model hyperparameters and performance to two log files. - - Args: - model: The model to log. - cfg: The configuration dictionary. - output_fp: The relative output file path. 
- """ - log_fp = Path(cfg.path.model_log_dir) - - # make a folder to log everything for this model - out_fp = log_fp / output_fp - out_fp.mkdir(parents=True, exist_ok=True) - - # config as a json - config_fp = out_fp / f"{cfg.path.config_log_stem}.log" - with open(config_fp, "w") as f: - f.write(OmegaConf.to_yaml(cfg)) - - model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.log" - with open(model_performance_fp, "w") as f: - f.write("model_fp,tuning_auc,test_auc\n") - f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") - - logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") - - def current_script_name() -> str: """Returns the name of the module that called this function.""" diff --git a/tests/test_integration.py b/tests/test_integration.py index df0e833..e2af97a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) +import glob import json import shutil import subprocess @@ -291,8 +292,8 @@ def test_integration(tmp_path): "output_model_dir": str(output_model_dir.resolve()), "model_launcher": model, "path.model_file_stem": model, - "hydra.sweeper.n_trials": 2, - "delete_below_top_k": 1, + "hydra.sweeper.n_trials": 3, + "delete_below_top_k": 2, "data_loading_params.keep_data_in_memory": True, } overrides = [f"tabularization.aggs={stdout_agg.strip()}"] @@ -305,9 +306,11 @@ def test_integration(tmp_path): stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") assert "Performance of best model:" in stderr if model == "xgboost": - assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1 + assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2 + assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1 else: - assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1 + assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2 + assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1 shutil.rmtree(output_model_dir) for model in [ @@ -322,8 +325,8 @@ def test_integration(tmp_path): "output_model_dir": str(output_model_dir.resolve()), "model_launcher": model, "path.model_file_stem": model, - "hydra.sweeper.n_trials": 2, - "delete_below_top_k": 1, + "hydra.sweeper.n_trials": 3, + "delete_below_top_k": 2, "data_loading_params.keep_data_in_memory": False, } overrides = [f"tabularization.aggs={stdout_agg.strip()}"] @@ -336,7 +339,9 @@ def test_integration(tmp_path): stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") assert "Performance of best model:" in stderr if model == "xgboost": - assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1 + assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2 + assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1 else: - assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1 + assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2 + assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1 shutil.rmtree(output_model_dir) diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 6bb4e6d..091948d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -315,7 +315,7 @@ def test_tabularize(tmp_path): df.write_parquet(out_fp) cache_task.main(cfg) - 
for split in split_json.keys(): + for split in split_json: for window in cfg.tabularization.window_sizes: for agg in cfg.tabularization.aggs: if agg.startswith("static"): @@ -360,11 +360,11 @@ def test_tabularize(tmp_path): launch_model.main(cfg) - expected_output_dir = Path(cfg.output_model_dir) + expected_output_dir = Path(cfg.time_output_model_dir) output_files = list(expected_output_dir.glob("**/*.json")) assert len(output_files) == 1 - log_dir = Path(cfg.path.model_log_dir) + log_dir = Path(cfg.path.sweep_results_dir) log_files = list(log_dir.glob("**/*.log")) assert len(log_files) == 2 shutil.rmtree(expected_output_dir) @@ -385,11 +385,11 @@ def test_tabularize(tmp_path): launch_model.main(cfg) - expected_output_dir = Path(cfg.output_model_dir) + expected_output_dir = Path(cfg.time_output_model_dir) output_files = list(expected_output_dir.glob("**/*.json")) assert len(output_files) == 1 - log_dir = Path(cfg.path.model_log_dir) + log_dir = Path(cfg.path.sweep_results_dir) log_files = list(log_dir.glob("**/*.log")) assert len(log_files) == 2 shutil.rmtree(expected_output_dir) @@ -408,7 +408,7 @@ def test_tabularize(tmp_path): launch_model.main(cfg) - expected_output_dir = Path(cfg.output_model_dir) + expected_output_dir = Path(cfg.time_output_model_dir) output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 shutil.rmtree(expected_output_dir) @@ -449,7 +449,7 @@ def test_tabularize(tmp_path): launch_model.main(cfg) - expected_output_dir = Path(cfg.output_model_dir) + expected_output_dir = Path(cfg.time_output_model_dir) output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 shutil.rmtree(expected_output_dir) @@ -473,6 +473,6 @@ def test_tabularize(tmp_path): launch_autogluon.main(cfg) - expected_output_filepath = Path(cfg.output_model_dir) / "predictor.pkl" + expected_output_filepath = Path(cfg.time_output_model_dir) / "predictor.pkl" assert expected_output_filepath.is_file() - ag.tabular.TabularPredictor.load(cfg.output_model_dir) + ag.tabular.TabularPredictor.load(cfg.time_output_model_dir) From f7e03dd17903802d83db60e8c52eccb9d4c61d7c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 10 Sep 2024 17:45:28 +0000 Subject: [PATCH 54/54] fixed doctest for deleting below top k models --- .../evaluation_callback.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py index ceb64e0..b2befb5 100644 --- a/src/MEDS_tabular_automl/evaluation_callback.py +++ b/src/MEDS_tabular_automl/evaluation_callback.py @@ -50,33 +50,37 @@ def log_performance(self, best_model_performance): logger.info("\n".join(log_performance_message)) def delete_below_top_k_models(self, performance, k, sweep_results_dir): - """Save only top k models from the model directory and delete all other files. + """Save only top k models from the sweep results directory and delete all other directories. Args: performance: DataFrame containing trial_name and performance metrics. k: Number of top models to save. - model_dir: Directory containing models. + sweep_results_dir: Directory containing trial results. Example: >>> import tempfile >>> import json + >>> import polars as pl + >>> from pathlib import Path >>> performance = pl.DataFrame( ... { - ... "trial_name": ["model1", "model2", "model3", "model4"], + ... "trial_name": ["trial1", "trial2", "trial3", "trial4"], ... "tuning_auc": [0.9, 0.8, 0.7, 0.6], ... 
"test_auc": [0.9, 0.8, 0.7, 0.6], ... } ... ) >>> k = 2 - >>> with tempfile.TemporaryDirectory() as model_dir: - ... for model in performance["trial_name"]: - ... with open(Path(model_dir) / f"{model}.json", 'w') as f: - ... json.dump({"model_name": model, "content": "dummy data"}, f) + >>> with tempfile.TemporaryDirectory() as sweep_dir: + ... for trial in performance["trial_name"]: + ... trial_dir = Path(sweep_dir) / trial + ... trial_dir.mkdir() + ... with open(trial_dir / "model.json", 'w') as f: + ... json.dump({"model_name": trial, "content": "dummy data"}, f) ... cb = EvaluationCallback() - ... cb.delete_below_top_k_models(performance, k, model_dir) - ... remaining_models = sorted(p.stem for p in Path(model_dir).iterdir()) - >>> remaining_models - ['model1', 'model2'] + ... cb.delete_below_top_k_models(performance, k, sweep_dir) + ... remaining_trials = sorted(p.name for p in Path(sweep_dir).iterdir()) + >>> remaining_trials + ['trial1', 'trial2'] """ logger.info(f"Deleting all models except top {k} models.") top_k_models = performance.head(k)["trial_name"].cast(pl.String).to_list()