From 0d5e9e8a0057997ae64dcff3f971e144d48948db Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 15:56:57 -0400
Subject: [PATCH 1/6] Updated pre-commit config too.

---
 .pre-commit-config.yaml                    |  4 +-
 src/MEDS_tabular_automl/dense_iterator.py  | 37 -----------------
 .../scripts/launch_autogluon.py            |  2 +-
 src/MEDS_tabular_automl/sklearn_model.py   | 41 +------------------
 src/MEDS_tabular_automl/tabular_dataset.py | 19 +++++++++
 src/MEDS_tabular_automl/xgboost_model.py   |  5 +--
 6 files changed, 23 insertions(+), 85 deletions(-)
 delete mode 100644 src/MEDS_tabular_automl/dense_iterator.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1533f74..6fd8933 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,5 @@
 default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py
deleted file mode 100644
index 33d13b0..0000000
--- a/src/MEDS_tabular_automl/dense_iterator.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import numpy as np
-import scipy.sparse as sp
-from mixins import TimeableMixin
-from omegaconf import DictConfig
-
-from .tabular_dataset import TabularDataset
-
-
-class DenseIterator(TabularDataset, TimeableMixin):
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        TabularDataset.__init__(self, cfg=cfg, split=split)
-        TimeableMixin.__init__(self)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
-        # self._it = 0
-
-    def densify(self) -> np.ndarray:
-        """Builds the data as a dense matrix based on column subselection."""
-
-        # get the dense matrix by iterating through the data shards
-        data = []
-        labels = []
-        for shard_idx in range(len(self._data_shards)):
-            shard_data, shard_labels = self.get_data_shards(shard_idx)
-            data.append(shard_data)
-            labels.append(shard_labels)
-        data = sp.vstack(data)
-        labels = np.concatenate(labels, axis=0)
-        return data, labels
diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
index db61e9f..d184603 100644
--- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py
+++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from omegaconf import DictConfig

-from MEDS_tabular_automl.dense_iterator import DenseIterator
+from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

 from ..utils import hydra_loguru_init
diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py
index 56063fd..dbc519f 100644
--- a/src/MEDS_tabular_automl/sklearn_model.py
+++ b/src/MEDS_tabular_automl/sklearn_model.py
@@ -7,46 +7,7 @@
 from sklearn.metrics import roc_auc_score

 from .base_model import BaseModel
-from .tabular_dataset import TabularDataset
-
-
-class SklearnIterator(TabularDataset):
-    """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models.
-
-    This class provides functionality for iterating through data shards, loading
-    feature data and labels, and processing them based on the provided configuration.
-
-    Args:
-        cfg: A configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        split: The data split to use, which can be one of "train", "tuning",
-            or "held_out". This determines which subset of the data is loaded and processed.
-
-    Attributes:
-        cfg: Configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        file_name_resolver: Object for resolving file names and paths based on the configuration.
-        split: The data split being used for loading and processing data shards.
-        _data_shards: List of data shard names.
-        valid_event_ids: Dictionary mapping shard number to a list of valid event IDs.
-        labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs.
-        codes_set: Set of codes to include in the data.
-        code_masks: Dictionary of code masks for filtering features based on aggregation.
-        num_features: Total number of features in the data.
-    """
-
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        super().__init__(cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
+from .tabular_dataset import TabularDataset as SklearnIterator


 class SklearnMatrix:
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index 84a6609..b698904 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -61,6 +61,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"):
         self._set_scaler()
         self._set_imputer()

+        self.valid_event_ids, self.labels = self._load_ids_and_labels()
+        # check if the labels are empty
+        if len(self.labels) == 0:
+            raise ValueError("No labels found.")
+
     @TimeableMixin.TimeAs
     def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]:
         """Creates boolean masks for filtering features.
@@ -497,3 +502,17 @@ def extract_name(test_file):
                 all_indices.extend(feature_ids)

         return all_feats, all_indices
+
+    def densify(self) -> np.ndarray:
+        """Builds the data as a dense matrix based on column subselection."""
+
+        # get the dense matrix by iterating through the data shards
+        data = []
+        labels = []
+        for shard_idx in range(len(self._data_shards)):
+            shard_data, shard_labels = self.get_data_shards(shard_idx)
+            data.append(shard_data)
+            labels.append(shard_labels)
+        data = sp.vstack(data)
+        labels = np.concatenate(labels, axis=0)
+        return data, labels
diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py
index ec33c15..ea7ab92 100644
--- a/src/MEDS_tabular_automl/xgboost_model.py
+++ b/src/MEDS_tabular_automl/xgboost_model.py
@@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str):
         """
         xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir))
         TabularDataset.__init__(self, cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if self.labels is None:
-            raise ValueError("No labels found.")
+
         self._it = 0

     def next(self, input_data: Callable) -> int:

From 2563aafe29e0e4ca0a470234738a3bd804c0b611 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 16:39:20 -0400
Subject: [PATCH 2/6] Removed a function that was not yet implemented.

---
 src/MEDS_tabular_automl/tabular_dataset.py | 28 ----------------------
 1 file changed, 28 deletions(-)

diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index b698904..f06433c 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -475,34 +475,6 @@ def extract_name(test_file):
         all_feats = [all_feats[i] for i in indices]
         return all_feats

-    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
-        """Retrieves the names and indices of the columns in the data.
-
-        Returns:
-            A tuple containing the names of the columns and their indices.
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-        files = get_model_files(self.cfg, self.split, self._data_shards[0])
-
-        def extract_name(test_file):
-            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))
-
-        agg_wind_combos = [extract_name(test_file) for test_file in files]
-
-        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
-        all_feats = []
-        all_indices = []
-        for agg_wind in agg_wind_combos:
-            window, feat, agg = agg_wind.split("/")
-            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
-            feature_names = [feature_columns[i] for i in feature_ids]
-            for feat_name in feature_names:
-                all_feats.append(f"{feat_name}/{agg}/{window}")
-                # use mask to append indices
-                all_indices.extend(feature_ids)
-
-        return all_feats, all_indices
-
     def densify(self) -> np.ndarray:
         """Builds the data as a dense matrix based on column subselection."""
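Note: after the two patches above, TabularDataset subsumes the deleted DenseIterator and SklearnIterator wrappers: label loading and the empty-label check now run in TabularDataset.__init__, and densify() stacks every shard into one matrix. A minimal usage sketch under these patches, assuming a valid Hydra cfg carrying the fields TabularDataset expects:

    from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

    # cfg is assumed to be an already-loaded DictConfig; construction now loads
    # event IDs and labels eagerly and raises ValueError("No labels found.")
    # when the split has no labels.
    iterator = DenseIterator(cfg, split="train")

    # densify() vstacks the sparse shard matrices and concatenates their labels;
    # despite the np.ndarray annotation it returns a (data, labels) pair.
    data, labels = iterator.densify()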
From 2d80905692c175b9777d9d3988a10aa9ff957cc8 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 16:42:27 -0400
Subject: [PATCH 3/6] Removing unused function in evaluation callback.

---
 src/MEDS_tabular_automl/evaluation_callback.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py
index d9236f8..0a394c5 100644
--- a/src/MEDS_tabular_automl/evaluation_callback.py
+++ b/src/MEDS_tabular_automl/evaluation_callback.py
@@ -3,13 +3,10 @@
 import polars as pl
 from hydra.experimental.callback import Callback
 from loguru import logger
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig


 class EvaluationCallback(Callback):
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-
     def on_multirun_end(self, config: DictConfig, **kwargs):
         """Find best model based on log files and logger.info its performance and hyperparameters."""
         log_fp = Path(config.model_logging.model_log_dir)
@@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs):
         logger.info(f"The best model can be found at {best_model}")
         self.log_performance(perf[0, :])
-        # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log")
         if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0:
             self.delete_below_top_k_models(
                 perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir
@@ -43,15 +39,6 @@ def log_performance(self, perf):
             f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}",
         )

-    def log_hyperparams(self, best_params_fp):
-        """logger.info hyperparameters of the best model with nice formatting."""
-        # check if this file exists
-        if not best_params_fp.is_file():
-            raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}")
-        best_params = OmegaConf.load(best_params_fp)
-        # print using OmegaConf.to_yaml
-        logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}")
-
     def delete_below_top_k_models(self, perf, k, model_dir):
         """Save only top k models from the model directory and delete all other files."""
         top_k_models = perf.head(k)["model_fp"].values

From 2f564e6c087736046825291eb00ca4242555405a Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:14:52 -0400
Subject: [PATCH 4/6] Removed unused pass block.

---
 src/MEDS_tabular_automl/utils.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py
index 3d6f496..bd272ef 100644
--- a/src/MEDS_tabular_automl/utils.py
+++ b/src/MEDS_tabular_automl/utils.py
@@ -77,16 +77,12 @@ def filter_to_codes(
         feature_freqs = feature_freqs.filter(pl.col("code").is_in(allowed_codes))

     if min_code_inclusion_frequency is not None:
-        pass
-        # need to consider size of the dataset vs count
-
-        # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency)
+        raise NotImplementedError("min_code_inclusion_frequency is not implemented yet")

     if min_code_inclusion_count is not None:
         feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)

     if max_include_codes is not None:
-        # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes)
         feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

     return sorted(feature_freqs["code"].to_list())
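Note: after the utils.py patch above, filter_to_codes has three live branches: allowed_codes and min_code_inclusion_count filter the Polars frequency table, min_code_inclusion_frequency now fails fast with NotImplementedError instead of silently doing nothing, and max_include_codes keeps only the most frequent codes. A standalone sketch of the surviving count-based logic; the toy feature_freqs frame and the threshold values below are invented for illustration:

    import polars as pl

    # Toy stand-in for the real code-frequency table.
    feature_freqs = pl.DataFrame({"code": ["A", "B", "C", "D"], "count": [50, 5, 20, 1]})

    min_code_inclusion_count = 10  # keep codes observed at least this many times
    max_include_codes = 2          # keep at most the N most frequent codes

    feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)
    feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

    print(sorted(feature_freqs["code"].to_list()))  # ['A', 'C']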
From 6f68a4b922b5fd9a1de78bf54457e2e607d202d4 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:16:51 -0400
Subject: [PATCH 5/6] Removing unnecessary keys call

---
 tests/test_tabularize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
index d110121..32ab69b 100644
--- a/tests/test_tabularize.py
+++ b/tests/test_tabularize.py
@@ -281,7 +281,7 @@ def test_tabularize(tmp_path):
             f"Time-Series Data matrix Should have {expected_num_rows}"
             f" rows but has {ts_matrix.shape[0]}!"
         )
     output_files = list_subdir_files(str(output_dir.resolve()), "npz")
-    for split in split_json.keys():
+    for split in split_json:
         for window in cfg.tabularization.window_sizes:
             for agg in cfg.tabularization.aggs:
                 if agg.startswith("static"):
From 6c2ba9a73cf26336f9d842b320f25cdcd397d2a0 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:23:38 -0400
Subject: [PATCH 6/6] Fixed workflow files

---
 .github/workflows/code-quality-main.yaml |  6 ++++--
 .github/workflows/code-quality-pr.yaml   |  6 ++++--
 .github/workflows/publish-to-pypi.yml    | 26 +-----------------------
 .github/workflows/tests.yaml             |  8 ++++----
 4 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml
index 3703b1f..bb2d601 100644
--- a/.github/workflows/code-quality-main.yaml
+++ b/.github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Run pre-commits
         uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml
index a97d2c0..46c9eec 100644
--- a/.github/workflows/code-quality-pr.yaml
+++ b/.github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Find modified files
         id: file_changes
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index d86806f..34eddad 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.11"
       - name: Install pypa/build
         run: >-
           python3 -m
@@ -91,27 +91,3 @@ jobs:
           gh release upload
           '${{ github.ref_name }}' dist/**
           --repo '${{ github.repository }}'
-
-  publish-to-testpypi:
-    name: Publish Python 🐍 distribution 📦 to TestPyPI
-    needs:
-      - build
-    runs-on: ubuntu-latest
-
-    environment:
-      name: testpypi
-      url: https://test.pypi.org/p/
-
-    permissions:
-      id-token: write # IMPORTANT: mandatory for trusted publishing
-
-    steps:
-      - name: Download all the dists
-        uses: actions/download-artifact@v3
-        with:
-          name: python-package-distributions
-          path: dist/
-      - name: Publish distribution 📦 to TestPyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://test.pypi.org/legacy/
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index c96be0e..268269e 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -17,12 +17,12 @@ jobs:
     steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Set up Python 3.12
-        uses: actions/setup-python@v3
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: "3.11"

       - name: Install packages
         run: |