added autogluon support, more models, more preprocessing strategies #81

Merged · 65 commits · Sep 10, 2024
Changes shown are from 12 of the 65 commits.

Commits
5fde57a
added autogluon support
Oufattole Aug 19, 2024
d6832cb
updates for autogluon
teyaberg Aug 19, 2024
0612730
[wip] filtering features
teyaberg Aug 20, 2024
2feee79
[wip] filtering features
teyaberg Aug 20, 2024
f3c985a
[wip] sharing for updates only
teyaberg Aug 20, 2024
b65754c
[wip] sharing for updates only
teyaberg Aug 20, 2024
a8d8417
[wip] doctests
teyaberg Aug 20, 2024
d07f6a2
autogluon
teyaberg Aug 20, 2024
2aebd70
added logged warning for static data being empty and added support fo…
Oufattole Aug 20, 2024
8c54317
Merge branch 'generalized_load_model' into dev
Oufattole Aug 20, 2024
ecf9292
Added support via hydra for selecting among four imputation methods (…
Oufattole Aug 21, 2024
e6cf085
fixed xgboost model yaml to load imputer and normalization from the m…
Oufattole Aug 21, 2024
94dfde2
added autogluon test and cli support
Oufattole Aug 21, 2024
527eda5
added three more sklearn models and fixed bug with normalization and i…
Oufattole Aug 21, 2024
0d7ed27
fixed bugs so correlation code filters work now
Oufattole Aug 21, 2024
9c542ea
sweeper
teyaberg Aug 21, 2024
1a519ff
logging
teyaberg Aug 21, 2024
8fc8863
made task caching parallelize and updated tests for configs
Oufattole Aug 21, 2024
5724d9b
Merge branch 'dev' of github.com:mmcdermott/MEDS_Tabular_AutoML into dev
Oufattole Aug 21, 2024
3e223bb
added more thorough tests for output file paths of task caching and …
Oufattole Aug 22, 2024
926732b
Merge branch 'main' into dev
Oufattole Aug 25, 2024
299bf6f
setup dynamic versioning
Oufattole Aug 25, 2024
8a7692a
version updates
teyaberg Sep 5, 2024
158b8fa
version updates
teyaberg Sep 6, 2024
e92049f
fix hydra-core version for experimental callback support
teyaberg Sep 6, 2024
0623aaa
eval callback logging
teyaberg Sep 6, 2024
e1be850
added script input args checks, reduced redundancy in model launcher …
Oufattole Sep 7, 2024
0e985ee
eval callback
teyaberg Sep 7, 2024
139870f
eval callback
teyaberg Sep 7, 2024
0d5e9e8
Updated pre-commit config too.
mmcdermott Sep 8, 2024
2563aaf
Removed a function that was not yet implemented.
mmcdermott Sep 8, 2024
2d80905
Removing unused function in evaluation callback.
mmcdermott Sep 8, 2024
d29ece9
eval callback
teyaberg Sep 8, 2024
81b022f
added yaml hierarchy for model_launcher
Oufattole Sep 8, 2024
57a4a81
updated configs, fixed most tests
Oufattole Sep 9, 2024
b704bba
Merged
mmcdermott Sep 9, 2024
2f564e6
Removed unused pass block.
mmcdermott Sep 9, 2024
6f68a4b
Removing unnecessary keys call
mmcdermott Sep 9, 2024
6c2ba9a
Fixed workflow files
mmcdermott Sep 9, 2024
e678145
fixed tabularize tests
Oufattole Sep 9, 2024
d64e237
added integration tests covering multirun for all launch_model models…
Oufattole Sep 9, 2024
8d12aed
merged dev
Oufattole Sep 9, 2024
c631e93
fixed tests
Oufattole Sep 9, 2024
2601fca
Merge pull request #90 from mmcdermott/configs
Oufattole Sep 9, 2024
a4ad03c
resolved review feedback. Added a base_model docstring. Added versio…
Oufattole Sep 9, 2024
0db7bd6
fixed min_code_inclusion_frequency kwarg
Oufattole Sep 9, 2024
b289033
added mimic iv tutorial
Oufattole Sep 9, 2024
9294920
updated tabularization script to fix bugs
Oufattole Sep 9, 2024
d71f9dc
reduced the number of workers for resharding
Oufattole Sep 9, 2024
aed27f1
Merged.
mmcdermott Sep 9, 2024
0dc2bc6
updated tabularize meds to take string input for tasks
Oufattole Sep 9, 2024
c981534
Merge pull request #91 from mmcdermott/improve_test_coverage
mmcdermott Sep 9, 2024
2aa4feb
Improved error handling per https://github.com/mmcdermott/MEDS_Tabula…
mmcdermott Sep 9, 2024
a6d9103
Update README.md
mmcdermott Sep 9, 2024
23eb4d4
added try except around loading 0 codes
Oufattole Sep 10, 2024
be5f723
fixed job name config bug where we were missing the $ so it was not …
Oufattole Sep 10, 2024
4c87e94
Merge branch 'dev' into MIMICIV
Oufattole Sep 10, 2024
d390658
fixed precommit issues
Oufattole Sep 10, 2024
b82ee6d
Merge branch 'dev' into MIMICIV
Oufattole Sep 10, 2024
a564886
fix paths for eval_callback and add check to test_integration
teyaberg Sep 10, 2024
430afba
fixing tests for delete_below_top_k
teyaberg Sep 10, 2024
6a89a9f
Merge pull request #92 from mmcdermott/MIMICIV
Oufattole Sep 10, 2024
9e6d99a
fix out of memory xgboost training and added test
teyaberg Sep 10, 2024
8316365
simplified pathing for results and evaluation callback
Oufattole Sep 10, 2024
f7e03dd
fixed doctest for deleting below top k models
Oufattole Sep 10, 2024
Files changed
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -8,7 +8,7 @@ authors = [
]
description = "Scalable Tabularization of MEDS format Time-Series data"
readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.11"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
@@ -17,7 +17,6 @@ classifiers = [
dependencies = [
"polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost",
"scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3",
"MEDS-transforms==0.0.5",
]

[project.scripts]
@@ -33,6 +32,7 @@ generate-subsets = "MEDS_tabular_automl.scripts.generate_subsets:main"
dev = ["pre-commit"]
tests = ["pytest", "pytest-cov", "rootutils"]
profiling = ["mprofile", "matplotlib"]
+autogluon = ["autogluon; python_version=='3.11.*'"] # Environment marker to restrict AutoGluon to Python 3.11

[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
30 changes: 30 additions & 0 deletions src/MEDS_tabular_automl/base_model.py
@@ -0,0 +1,30 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import TypeVar

from mixins import TimeableMixin
from omegaconf import DictConfig

T = TypeVar("T")


class BaseModel(ABC, TimeableMixin):
    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def evaluate(self) -> float:
        pass

    @abstractmethod
    def save_model(self, output_fp: Path):
        pass

    @classmethod
    def initialize(cls: T, **kwargs) -> T:
        return cls(DictConfig(kwargs, flags={"allow_objects": True}))
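
To make the contract concrete, here is a minimal sketch of a subclass; `ConstantModel` is hypothetical and for illustration only (the PR's real implementations are `SklearnModel` and `XGBoostModel`). Note how `initialize` wraps the keyword arguments in a `DictConfig` before they reach `__init__`:

from pathlib import Path

from omegaconf import DictConfig

from MEDS_tabular_automl.base_model import BaseModel


class ConstantModel(BaseModel):
    """Toy model that always scores 0.5; shows the minimal BaseModel surface."""

    def __init__(self, cfg: DictConfig):
        self.cfg = cfg

    def train(self):
        pass  # a real subclass fits on the tabularized, task-cached shards here

    def evaluate(self) -> float:
        return 0.5  # a real subclass returns a held-out metric such as AUC

    def save_model(self, output_fp: Path):
        output_fp.write_text("constant")


# initialize() packs the kwargs into a DictConfig and calls __init__ with it:
model = ConstantModel.initialize(seed=1)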
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/configs/imputer/default.yaml
@@ -0,0 +1 @@
imputer_target: null
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml
@@ -0,0 +1,3 @@
imputer_target:
  _target_: sklearn.impute.SimpleImputer
  strategy: "mean"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml
@@ -0,0 +1,3 @@
imputer_target:
  _target_: sklearn.impute.SimpleImputer
  strategy: "median"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml
@@ -0,0 +1,3 @@
imputer_target:
  _target_: sklearn.impute.SimpleImputer
  strategy: "most_frequent"
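
Presumably these imputer nodes are instantiated through Hydra before being handed to the data iterator; a minimal sketch of that mechanism, using the mean-imputer config above (the config literal mirrors the YAML, everything else is standard Hydra/sklearn):

import numpy as np
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Same structure as configs/imputer/mean_imputer.yaml
cfg = OmegaConf.create(
    {"imputer_target": {"_target_": "sklearn.impute.SimpleImputer", "strategy": "mean"}}
)

imputer = instantiate(cfg.imputer_target)  # builds SimpleImputer(strategy="mean")
X = np.array([[1.0, np.nan], [3.0, 4.0]])
print(imputer.fit_transform(X))  # the NaN becomes the column mean, 4.0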
28 changes: 28 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_autogluon.yaml
Review comment from mmcdermott (Owner): Can this have any shared overlap with the launch_model.yaml?

@@ -0,0 +1,28 @@
defaults:
  - default
  - tabularization: default
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
  - _self_

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}

# Model parameters
model_params:
  iterator:
    keep_data_in_memory: True
    binarize_task: True

log_dir: ${model_dir}/.logs/
log_filepath: ${log_dir}/log.txt

name: launch_autogluon
33 changes: 33 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -0,0 +1,33 @@
defaults:
  - _self_
  - default
  - tabularization: default
  - model: xgboost # This can be changed to sgd_classifier or any other model
  - imputer: default
  - normalization: default
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}/model_metadata.json

log_dir: ${model_dir}/.logs/

name: launch_model

hydra:
  verbose: False
  job:
    name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S}
  sweep:
    dir: ${log_dir}
  run:
    dir: ${log_dir}
33 changes: 33 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml
@@ -0,0 +1,33 @@
defaults:
  - default
  - tabularization: default
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
  - _self_

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}/model_metadata.json

# Model parameters
model_params:
  epochs: 20
  early_stopping_rounds: 5
  model:
    _target_: sklearn.linear_model.SGDClassifier
    loss: log_loss
    # n_iter: ${model_params.epochs} # not sure if we want this behaviour
  iterator:
    keep_data_in_memory: True
    binarize_task: True

log_dir: ${model_dir}/.logs/

name: launch_sklearnmodel
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -53,6 +53,6 @@ hydra:
model_params.num_boost_round: range(100, 1000)
model_params.early_stopping_rounds: range(1, 10)
+model_params.model.max_depth: range(2, 16)
-tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
+tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))

name: launch_xgboost
34 changes: 34 additions & 0 deletions src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml
@@ -0,0 +1,34 @@
# @package _global_

model_target:
  _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize
  model_params: ${model_params}
  input_dir: ${input_dir}
  input_label_dir: ${input_label_dir}
  model_dir: ${model_dir}
  output_filepath: ${output_filepath}
  log_dir: ${log_dir}
  cache_dir: ${cache_dir}
  imputer: ${model_params.iterator.imputer}
  normalization: ${model_params.iterator.normalization}

model_params:
  epochs: 20
  early_stopping_rounds: 5
  model:
    _target_: sklearn.linear_model.SGDClassifier
    loss: log_loss
  iterator:
    keep_data_in_memory: True
    binarize_task: True
    normalization: ${normalization}
    imputer: ${imputer}

hydra:
  sweeper:
    params:
      +model_params.model.alpha: tag(log, interval(1e-6, 1))
      +model_params.model.l1_ratio: interval(0, 1)
      +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet'])
      model_params.epochs: range(10, 100)
      model_params.early_stopping_rounds: range(1, 10)
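
Mechanically, the `model_target` node above is what the launcher presumably instantiates to obtain a model object. A stripped-down sketch of that step; the inline config literal is abbreviated and illustrative (the real node also threads through `input_dir`, `model_dir`, and the other interpolations):

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Abbreviated stand-in for the model_target node above.
cfg = OmegaConf.create(
    {
        "_target_": "MEDS_tabular_automl.sklearn_model.SklearnModel.initialize",
        "model_params": {"epochs": 20, "early_stopping_rounds": 5},
    }
)

# Hydra resolves the dotted path (here, a classmethod) and calls it with the
# remaining keys as keyword arguments: SklearnModel.initialize(model_params=...)
model = instantiate(cfg)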
42 changes: 42 additions & 0 deletions src/MEDS_tabular_automl/configs/model/xgboost.yaml
@@ -0,0 +1,42 @@
# @package _global_

model_target:
  _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize
  model_params: ${model_params}
  input_dir: ${input_dir}
  input_label_dir: ${input_label_dir}
  model_dir: ${model_dir}
  output_filepath: ${output_filepath}
  log_dir: ${log_dir}
  cache_dir: ${cache_dir}
  imputer: ${model_params.iterator.imputer}
  normalization: ${model_params.iterator.normalization}
  # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with its resolvers.

model_params:
  num_boost_round: 1000
  early_stopping_rounds: 5
  model:
    booster: gbtree
    device: cpu
    nthread: 1
    tree_method: hist
    objective: binary:logistic
  iterator:
    keep_data_in_memory: True
    binarize_task: True
    normalization: ${normalization}
    imputer: ${imputer}

hydra:
  sweeper:
    params:
      +model_params.model.eta: tag(log, interval(0.001, 1))
      +model_params.model.lambda: tag(log, interval(0.001, 1))
      +model_params.model.alpha: tag(log, interval(0.001, 1))
      +model_params.model.subsample: interval(0.5, 1)
      +model_params.model.min_child_weight: interval(1e-2, 100)
      model_params.num_boost_round: range(100, 1000)
      model_params.early_stopping_rounds: range(1, 10)
      +model_params.model.max_depth: range(2, 16)
      tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/configs/normalization/default.yaml
@@ -0,0 +1 @@
normalizer: null
New normalization config (MinMaxScaler):
@@ -0,0 +1,2 @@
normalizer:
  _target_: sklearn.preprocessing.MinMaxScaler
New normalization config (StandardScaler):
@@ -0,0 +1,3 @@
normalizer:
  _target_: sklearn.preprocessing.StandardScaler
  with_mean: False # This preserves the sparsity of the input data.
Review comment from mmcdermott (Owner): I'm confused about this... Why do you want this to default to False? Does this mean you won't actually center the data that is present to have mean zero?
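
For context on the question above: centering a sparse matrix would subtract a dense column mean from every entry, turning zeros into nonzeros, so sklearn refuses to center sparse input at all. A small sketch of the behavior the `with_mean: False` default works around (standard scipy/sklearn APIs only):

import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler

X = sp.random(1000, 500, density=0.01, format="csr")  # ~1% nonzero

try:
    StandardScaler(with_mean=True).fit(X)
except (TypeError, ValueError) as err:
    print(err)  # sklearn: cannot center sparse matrices, pass with_mean=False

# Scaling by the standard deviation alone keeps zeros as zeros:
X_scaled = StandardScaler(with_mean=False).fit_transform(X)
print(X.nnz == X_scaled.nnz)  # True: the sparsity pattern is preserved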

8 changes: 5 additions & 3 deletions src/MEDS_tabular_automl/configs/tabularization/default.yaml
@@ -1,7 +1,9 @@
# User inputs
-filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet
allowed_codes: null
-min_code_inclusion_frequency: 10
+filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet
+min_code_inclusion_count: 10
+min_code_inclusion_frequency: null
+max_included_codes: null
window_sizes:
  - "1d"
  - "7d"
@@ -19,4 +21,4 @@ aggs:
- "value/max"

# Resolved inputs
-_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.filtered_code_metadata_fp}}
+_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},${tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}
Review comment from mmcdermott (Owner): you might simplify this to just have it take in the tabularization dictionary
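
For readers unfamiliar with the `${filter_to_codes:...}` syntax: it invokes a custom OmegaConf resolver that the package registers, so the allowed-code list is computed lazily from the other tabularization fields. A sketch of the mechanism with a stub resolver body (the real `filter_to_codes` reads the code-metadata parquet and applies the count/frequency/max filters; the stub below is illustrative only):

from omegaconf import OmegaConf

def filter_to_codes(metadata_fp, allowed_codes, min_count, min_frequency, max_codes):
    codes = ["lab//A", "lab//B", "dx//C"]  # stub; really read from metadata_fp
    return codes if allowed_codes is None else [c for c in codes if c in allowed_codes]

OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes)

cfg = OmegaConf.create(
    {
        "filtered_code_metadata_fp": "codes.parquet",
        "allowed_codes": None,
        "min_code_inclusion_count": 10,
        "min_code_inclusion_frequency": None,
        "max_included_codes": None,
        "_resolved_codes": "${filter_to_codes:${filtered_code_metadata_fp},${allowed_codes},"
        "${min_code_inclusion_count},${min_code_inclusion_frequency},${max_included_codes}}",
    }
)
print(cfg["_resolved_codes"])  # resolver runs on access: ['lab//A', 'lab//B', 'dx//C']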

37 changes: 37 additions & 0 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -0,0 +1,37 @@
import numpy as np
Review comment from mmcdermott (Owner): Can you add a docstring to this module explaining what it is and its purpose relative to other parts of the repo?

import scipy.sparse as sp
from mixins import TimeableMixin
from omegaconf import DictConfig

from .tabular_dataset import TabularDataset


class DenseIterator(TabularDataset, TimeableMixin):
Review comment from mmcdermott (Owner): as TabularDataset is a derived class of TimeableMixin, DenseIterator doesn't need that dependency as well. That can simplify your superclass related code and imports here.

    def __init__(self, cfg: DictConfig, split: str):
        """Initializes the DenseIterator with the provided configuration and data split.

        Args:
            cfg: The configuration dictionary.
            split: The data split to use.
        """
        TabularDataset.__init__(self, cfg=cfg, split=split)
        TimeableMixin.__init__(self)
        self.valid_event_ids, self.labels = self._load_ids_and_labels()
        # check if the labels are empty
        if len(self.labels) == 0:
            raise ValueError("No labels found.")
        # self._it = 0

    def densify(self) -> tuple[sp.spmatrix, np.ndarray]:
        """Stacks all data shards into one sparse matrix and one label vector."""

        # build the matrix by iterating through the data shards
        data = []
        labels = []
        for shard_idx in range(len(self._data_shards)):
            shard_data, shard_labels = self.get_data_shards(shard_idx)
            data.append(shard_data)
            labels.append(shard_labels)
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
        return data, labels
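
A sketch of how this iterator is presumably consumed downstream, e.g. to hand one in-memory matrix to AutoGluon or an sklearn estimator. The config keys shown are illustrative; real runs populate them from the Hydra launch configs:

import pandas as pd
from omegaconf import DictConfig

from MEDS_tabular_automl.dense_iterator import DenseIterator

# Hypothetical minimal config; the real one carries tabularization settings too.
cfg = DictConfig({"input_dir": "cohort/task/task_cache", "input_label_dir": "cohort/task/labels"})

train_iter = DenseIterator(cfg, split="train")
X_train, y_train = train_iter.densify()  # stacked (sparse) matrix + label vector

# e.g. build a DataFrame for AutoGluon's TabularPredictor, which expects tabular input:
train_df = pd.DataFrame.sparse.from_spmatrix(X_train)
train_df["label"] = y_train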
2 changes: 2 additions & 0 deletions src/MEDS_tabular_automl/generate_static_features.py
@@ -185,6 +185,8 @@ def get_flat_static_rep(
"""
static_features = get_feature_names(agg=agg, feature_columns=feature_columns)
static_measurements = summarize_static_measurements(agg, static_features, df=shard_df)
if len(static_features) == 0:
raise ValueError(f"No static features found. Remove the aggregation function {agg}")
# convert to sparse_matrix
matrix = get_sparse_static_rep(static_features, static_measurements.lazy(), shard_df, feature_columns)
assert matrix.shape[1] == len(
6 changes: 3 additions & 3 deletions src/MEDS_tabular_automl/mapper.py
Review comment from mmcdermott (Owner): Do you want to just see if you can import these functions from MEDS-Transforms? I guess that would conflict with the python version change, though...

@@ -5,10 +5,12 @@
from collections.abc import Callable
from datetime import datetime
from pathlib import Path
+from typing import TypeVar

from loguru import logger

LOCK_TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f"
+DF_T = TypeVar("DF_T")


def get_earliest_lock(cache_directory: Path) -> datetime | None:
@@ -82,9 +84,7 @@ def register_lock(cache_directory: Path) -> tuple[datetime, Path]:
return lock_time, lock_fp


-def wrap[
-    DF_T
-](
+def wrap(
     in_fp: Path,
     out_fp: Path,
     read_fn: Callable[[Path], DF_T],
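
This change replaces Python 3.12's PEP 695 type-parameter syntax (`def wrap[DF_T](...)`) with a module-level `TypeVar`, which is what allows `requires-python` to drop to 3.11 and hence lets the AutoGluon extra install. The two spellings are equivalent to a type checker; a minimal illustration:

from typing import TypeVar

DF_T = TypeVar("DF_T")

# Python >= 3.12 only (PEP 695), as removed by this PR:
#     def passthrough[DF_T](x: DF_T) -> DF_T: ...

# Python 3.11-compatible spelling with an explicit TypeVar:
def passthrough(x: DF_T) -> DF_T:
    """Either form tells the checker the return type matches the argument type."""
    return x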