From 0d5e9e8a0057997ae64dcff3f971e144d48948db Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 15:56:57 -0400
Subject: [PATCH 1/6] Updated pre-commit config too.

---
 .pre-commit-config.yaml                    |  4 +-
 src/MEDS_tabular_automl/dense_iterator.py  | 37 -----------------
 .../scripts/launch_autogluon.py            |  2 +-
 src/MEDS_tabular_automl/sklearn_model.py   | 41 +------------------
 src/MEDS_tabular_automl/tabular_dataset.py | 19 +++++++++
 src/MEDS_tabular_automl/xgboost_model.py   |  5 +--
 6 files changed, 23 insertions(+), 85 deletions(-)
 delete mode 100644 src/MEDS_tabular_automl/dense_iterator.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1533f74..6fd8933 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,5 @@
 default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/src/MEDS_tabular_automl/dense_iterator.py b/src/MEDS_tabular_automl/dense_iterator.py
deleted file mode 100644
index 33d13b0..0000000
--- a/src/MEDS_tabular_automl/dense_iterator.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import numpy as np
-import scipy.sparse as sp
-from mixins import TimeableMixin
-from omegaconf import DictConfig
-
-from .tabular_dataset import TabularDataset
-
-
-class DenseIterator(TabularDataset, TimeableMixin):
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        TabularDataset.__init__(self, cfg=cfg, split=split)
-        TimeableMixin.__init__(self)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
-        # self._it = 0
-
-    def densify(self) -> np.ndarray:
-        """Builds the data as a dense matrix based on column subselection."""
-
-        # get the dense matrix by iterating through the data shards
-        data = []
-        labels = []
-        for shard_idx in range(len(self._data_shards)):
-            shard_data, shard_labels = self.get_data_shards(shard_idx)
-            data.append(shard_data)
-            labels.append(shard_labels)
-        data = sp.vstack(data)
-        labels = np.concatenate(labels, axis=0)
-        return data, labels
diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
index db61e9f..d184603 100644
--- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py
+++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from omegaconf import DictConfig

-from MEDS_tabular_automl.dense_iterator import DenseIterator
+from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

 from ..utils import hydra_loguru_init
diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py
index 56063fd..dbc519f 100644
--- a/src/MEDS_tabular_automl/sklearn_model.py
+++ b/src/MEDS_tabular_automl/sklearn_model.py
@@ -7,46 +7,7 @@
 from sklearn.metrics import roc_auc_score

 from .base_model import BaseModel
-from .tabular_dataset import TabularDataset
-
-
-class SklearnIterator(TabularDataset):
-    """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models.
-
-    This class provides functionality for iterating through data shards, loading
-    feature data and labels, and processing them based on the provided configuration.
-
-    Args:
-        cfg: A configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        split: The data split to use, which can be one of "train", "tuning",
-            or "held_out". This determines which subset of the data is loaded and processed.
-
-    Attributes:
-        cfg: Configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        file_name_resolver: Object for resolving file names and paths based on the configuration.
-        split: The data split being used for loading and processing data shards.
-        _data_shards: List of data shard names.
-        valid_event_ids: Dictionary mapping shard number to a list of valid event IDs.
-        labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs.
-        codes_set: Set of codes to include in the data.
-        code_masks: Dictionary of code masks for filtering features based on aggregation.
-        num_features: Total number of features in the data.
-    """
-
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        super().__init__(cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
+from .tabular_dataset import TabularDataset as SklearnIterator


 class SklearnMatrix:
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index 84a6609..b698904 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -61,6 +61,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"):
         self._set_scaler()
         self._set_imputer()

+        self.valid_event_ids, self.labels = self._load_ids_and_labels()
+        # check if the labels are empty
+        if len(self.labels) == 0:
+            raise ValueError("No labels found.")
+
     @TimeableMixin.TimeAs
     def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]:
         """Creates boolean masks for filtering features.
@@ -497,3 +502,17 @@ def extract_name(test_file):
                 all_indices.extend(feature_ids)

         return all_feats, all_indices
+
+    def densify(self) -> np.ndarray:
+        """Builds the data as a dense matrix based on column subselection."""
+
+        # get the dense matrix by iterating through the data shards
+        data = []
+        labels = []
+        for shard_idx in range(len(self._data_shards)):
+            shard_data, shard_labels = self.get_data_shards(shard_idx)
+            data.append(shard_data)
+            labels.append(shard_labels)
+        data = sp.vstack(data)
+        labels = np.concatenate(labels, axis=0)
+        return data, labels
diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py
index ec33c15..ea7ab92 100644
--- a/src/MEDS_tabular_automl/xgboost_model.py
+++ b/src/MEDS_tabular_automl/xgboost_model.py
@@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str):
         """
         xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir))
         TabularDataset.__init__(self, cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if self.labels is None:
-            raise ValueError("No labels found.")
+
         self._it = 0

     def next(self, input_data: Callable) -> int:

From 2563aafe29e0e4ca0a470234738a3bd804c0b611 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 16:39:20 -0400
Subject: [PATCH 2/6] Removed a function that was not yet implemented.

---
 src/MEDS_tabular_automl/tabular_dataset.py | 28 ----------------------
 1 file changed, 28 deletions(-)

diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index b698904..f06433c 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -475,34 +475,6 @@ def extract_name(test_file):
         all_feats = [all_feats[i] for i in indices]
         return all_feats

-    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
-        """Retrieves the names and indices of the columns in the data.
-
-        Returns:
-            A tuple containing the names of the columns and their indices.
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-        files = get_model_files(self.cfg, self.split, self._data_shards[0])
-
-        def extract_name(test_file):
-            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))
-
-        agg_wind_combos = [extract_name(test_file) for test_file in files]
-
-        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
-        all_feats = []
-        all_indices = []
-        for agg_wind in agg_wind_combos:
-            window, feat, agg = agg_wind.split("/")
-            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
-            feature_names = [feature_columns[i] for i in feature_ids]
-            for feat_name in feature_names:
-                all_feats.append(f"{feat_name}/{agg}/{window}")
-                # use mask to append indices
-                all_indices.extend(feature_ids)
-
-        return all_feats, all_indices
-
     def densify(self) -> np.ndarray:
         """Builds the data as a dense matrix based on column subselection."""
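Note: after the two patches above, TabularDataset subsumes the deleted DenseIterator and SklearnIterator wrappers: label loading and the empty-label check now run in TabularDataset.__init__, and densify() stacks every shard into one matrix. A minimal usage sketch under these patches, assuming a valid Hydra cfg carrying the fields TabularDataset expects:

    from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

    # cfg is assumed to be an already-loaded DictConfig; construction now loads
    # event IDs and labels eagerly and raises ValueError("No labels found.")
    # when the split has no labels.
    iterator = DenseIterator(cfg, split="train")

    # densify() vstacks the sparse shard matrices and concatenates their labels;
    # despite the np.ndarray annotation it returns a (data, labels) pair.
    data, labels = iterator.densify()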
From 2d80905692c175b9777d9d3988a10aa9ff957cc8 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 16:42:27 -0400
Subject: [PATCH 3/6] Removing unused function in evaluation callback.

---
 src/MEDS_tabular_automl/evaluation_callback.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/src/MEDS_tabular_automl/evaluation_callback.py b/src/MEDS_tabular_automl/evaluation_callback.py
index d9236f8..0a394c5 100644
--- a/src/MEDS_tabular_automl/evaluation_callback.py
+++ b/src/MEDS_tabular_automl/evaluation_callback.py
@@ -3,13 +3,10 @@
 import polars as pl
 from hydra.experimental.callback import Callback
 from loguru import logger
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig


 class EvaluationCallback(Callback):
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-
     def on_multirun_end(self, config: DictConfig, **kwargs):
         """Find best model based on log files and logger.info its performance and hyperparameters."""
         log_fp = Path(config.model_logging.model_log_dir)
@@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs):
         logger.info(f"The best model can be found at {best_model}")
         self.log_performance(perf[0, :])
-        # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log")
         if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0:
             self.delete_below_top_k_models(
                 perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir
@@ -43,15 +39,6 @@ def log_performance(self, perf):
             f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}",
         )

-    def log_hyperparams(self, best_params_fp):
-        """logger.info hyperparameters of the best model with nice formatting."""
-        # check if this file exists
-        if not best_params_fp.is_file():
-            raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}")
-        best_params = OmegaConf.load(best_params_fp)
-        # print using OmegaConf.to_yaml
-        logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}")
-
     def delete_below_top_k_models(self, perf, k, model_dir):
         """Save only top k models from the model directory and delete all other files."""
         top_k_models = perf.head(k)["model_fp"].values

From 2f564e6c087736046825291eb00ca4242555405a Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:14:52 -0400
Subject: [PATCH 4/6] Removed unused pass block.

---
 src/MEDS_tabular_automl/utils.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py
index 3d6f496..bd272ef 100644
--- a/src/MEDS_tabular_automl/utils.py
+++ b/src/MEDS_tabular_automl/utils.py
@@ -77,16 +77,12 @@ def filter_to_codes(
         feature_freqs = feature_freqs.filter(pl.col("code").is_in(allowed_codes))

     if min_code_inclusion_frequency is not None:
-        pass
-        # need to consider size of the dataset vs count
-
-        # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency)
+        raise NotImplementedError("min_code_inclusion_frequency is not implemented yet")

     if min_code_inclusion_count is not None:
         feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)

     if max_include_codes is not None:
-        # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes)
         feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

     return sorted(feature_freqs["code"].to_list())
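Note: after the utils.py patch above, filter_to_codes has three live branches: allowed_codes and min_code_inclusion_count filter the Polars frequency table, min_code_inclusion_frequency now fails fast with NotImplementedError instead of silently doing nothing, and max_include_codes keeps only the most frequent codes. A standalone sketch of the surviving count-based logic; the toy feature_freqs frame and the threshold values below are invented for illustration:

    import polars as pl

    # Toy stand-in for the real code-frequency table.
    feature_freqs = pl.DataFrame({"code": ["A", "B", "C", "D"], "count": [50, 5, 20, 1]})

    min_code_inclusion_count = 10  # keep codes observed at least this many times
    max_include_codes = 2          # keep at most the N most frequent codes

    feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)
    feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

    print(sorted(feature_freqs["code"].to_list()))  # ['A', 'C']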
From 6f68a4b922b5fd9a1de78bf54457e2e607d202d4 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:16:51 -0400
Subject: [PATCH 5/6] Removing unnecessary keys call

---
 tests/test_tabularize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
index d110121..32ab69b 100644
--- a/tests/test_tabularize.py
+++ b/tests/test_tabularize.py
@@ -281,7 +281,7 @@ def test_tabularize(tmp_path):
             f"Time-Series Data matrix Should have {expected_num_rows}"
             f" rows but has {ts_matrix.shape[0]}!"
         )
     output_files = list_subdir_files(str(output_dir.resolve()), "npz")
-    for split in split_json.keys():
+    for split in split_json:
         for window in cfg.tabularization.window_sizes:
             for agg in cfg.tabularization.aggs:
                 if agg.startswith("static"):
From 6c2ba9a73cf26336f9d842b320f25cdcd397d2a0 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sun, 8 Sep 2024 21:23:38 -0400
Subject: [PATCH 6/6] Fixed workflow files

---
 .github/workflows/code-quality-main.yaml |  6 ++++--
 .github/workflows/code-quality-pr.yaml   |  6 ++++--
 .github/workflows/publish-to-pypi.yml    | 26 +-----------------------
 .github/workflows/tests.yaml             |  8 ++++----
 4 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml
index 3703b1f..bb2d601 100644
--- a/.github/workflows/code-quality-main.yaml
+++ b/.github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Run pre-commits
         uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml
index a97d2c0..46c9eec 100644
--- a/.github/workflows/code-quality-pr.yaml
+++ b/.github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Find modified files
         id: file_changes
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index d86806f..34eddad 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.11"
       - name: Install pypa/build
         run: >-
           python3 -m
@@ -91,27 +91,3 @@ jobs:
           gh release upload
           '${{ github.ref_name }}' dist/**
           --repo '${{ github.repository }}'
-
-  publish-to-testpypi:
-    name: Publish Python 🐍 distribution 📦 to TestPyPI
-    needs:
-      - build
-    runs-on: ubuntu-latest
-
-    environment:
-      name: testpypi
-      url: https://test.pypi.org/p/
-
-    permissions:
-      id-token: write # IMPORTANT: mandatory for trusted publishing
-
-    steps:
-      - name: Download all the dists
-        uses: actions/download-artifact@v3
-        with:
-          name: python-package-distributions
-          path: dist/
-      - name: Publish distribution 📦 to TestPyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://test.pypi.org/legacy/
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index c96be0e..268269e 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -17,12 +17,12 @@ jobs:
     steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Set up Python 3.12
-        uses: actions/setup-python@v3
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: "3.11"

       - name: Install packages
         run: |