
added autogluon support, more models, more preprocessing strategies #81

Merged
merged 65 commits on Sep 10, 2024
Changes from 50 commits

Commits
5fde57a
added autogluon support
Oufattole Aug 19, 2024
d6832cb
updates for autogluon
teyaberg Aug 19, 2024
0612730
[wip] filtering features
teyaberg Aug 20, 2024
2feee79
[wip] filtering features
teyaberg Aug 20, 2024
f3c985a
[wip] sharing for updates only
teyaberg Aug 20, 2024
b65754c
[wip] sharing for updates only
teyaberg Aug 20, 2024
a8d8417
[wip] doctests
teyaberg Aug 20, 2024
d07f6a2
autogluon
teyaberg Aug 20, 2024
2aebd70
added logged warning for static data being empty and added support fo…
Oufattole Aug 20, 2024
8c54317
Merge branch 'generalized_load_model' into dev
Oufattole Aug 20, 2024
ecf9292
Added support via hydra for selecting among four imputation methods (…
Oufattole Aug 21, 2024
e6cf085
fixed xgboost model yaml to load imputer and normalization from the m…
Oufattole Aug 21, 2024
94dfde2
added autogluon test and cli support
Oufattole Aug 21, 2024
527eda5
added three more sklearn models and fixed bug with normalization and i…
Oufattole Aug 21, 2024
0d7ed27
fixed bugs so correlation code filters work now
Oufattole Aug 21, 2024
9c542ea
sweeper
teyaberg Aug 21, 2024
1a519ff
logging
teyaberg Aug 21, 2024
8fc8863
made task caching parallelizable and updated tests for configs
Oufattole Aug 21, 2024
5724d9b
Merge branch 'dev' of github.com:mmcdermott/MEDS_Tabular_AutoML into dev
Oufattole Aug 21, 2024
3e223bb
added more thorough tests for output file paths of task caching and …
Oufattole Aug 22, 2024
926732b
Merge branch 'main' into dev
Oufattole Aug 25, 2024
299bf6f
setup dynamic versioning
Oufattole Aug 25, 2024
8a7692a
version updates
teyaberg Sep 5, 2024
158b8fa
version updates
teyaberg Sep 6, 2024
e92049f
fix hydra-core version for experimental callback support
teyaberg Sep 6, 2024
0623aaa
eval callback logging
teyaberg Sep 6, 2024
e1be850
added script input args checks, reduced redundancy in model launcher …
Oufattole Sep 7, 2024
0e985ee
eval callback
teyaberg Sep 7, 2024
139870f
eval callback
teyaberg Sep 7, 2024
0d5e9e8
Updated pre-commit config too.
mmcdermott Sep 8, 2024
2563aaf
Removed a function that was not yet implemented.
mmcdermott Sep 8, 2024
2d80905
Removing unused function in evaluation callback.
mmcdermott Sep 8, 2024
d29ece9
eval callback
teyaberg Sep 8, 2024
81b022f
added yaml hierarchy for model_launcher
Oufattole Sep 8, 2024
57a4a81
updated configs, fixed most tests
Oufattole Sep 9, 2024
b704bba
Merged
mmcdermott Sep 9, 2024
2f564e6
Removed unused pass block.
mmcdermott Sep 9, 2024
6f68a4b
Removing unnecessary keys call
mmcdermott Sep 9, 2024
6c2ba9a
Fixed workflow files
mmcdermott Sep 9, 2024
e678145
fixed tabularize tests
Oufattole Sep 9, 2024
d64e237
added integration tests covering multirun for all launch_model models…
Oufattole Sep 9, 2024
8d12aed
merged dev
Oufattole Sep 9, 2024
c631e93
fixed tests
Oufattole Sep 9, 2024
2601fca
Merge pull request #90 from mmcdermott/configs
Oufattole Sep 9, 2024
a4ad03c
resolved review feedback. Added a based_model docstring. Added versio…
Oufattole Sep 9, 2024
0db7bd6
fixed min_code_inclusion_frequency kwarg
Oufattole Sep 9, 2024
b289033
added mimic iv tutorial
Oufattole Sep 9, 2024
9294920
updated tabularization script to fix bugs
Oufattole Sep 9, 2024
d71f9dc
reduced the number of workers for resharding
Oufattole Sep 9, 2024
aed27f1
Merged.
mmcdermott Sep 9, 2024
0dc2bc6
updated tabularize meds to take string input for tasks
Oufattole Sep 9, 2024
c981534
Merge pull request #91 from mmcdermott/improve_test_coverage
mmcdermott Sep 9, 2024
2aa4feb
Improved error handling per https://github.com/mmcdermott/MEDS_Tabula…
mmcdermott Sep 9, 2024
a6d9103
Update README.md
mmcdermott Sep 9, 2024
23eb4d4
added try except around loading 0 codes
Oufattole Sep 10, 2024
be5f723
fixed job name config bug where we were missing the $ so it was not …
Oufattole Sep 10, 2024
4c87e94
Merge branch 'dev' into MIMICIV
Oufattole Sep 10, 2024
d390658
fixed precommit issues
Oufattole Sep 10, 2024
b82ee6d
Merge branch 'dev' into MIMICIV
Oufattole Sep 10, 2024
a564886
fix paths for eval_callback and add check to test_integration
teyaberg Sep 10, 2024
430afba
fixing tests for delete_below_top_k
teyaberg Sep 10, 2024
6a89a9f
Merge pull request #92 from mmcdermott/MIMICIV
Oufattole Sep 10, 2024
9e6d99a
fix out of memory xgboost training and added test
teyaberg Sep 10, 2024
8316365
simplified pathing for results and evaluation callback
Oufattole Sep 10, 2024
f7e03dd
fixed doctest for deleting below top k models
Oufattole Sep 10, 2024
Files changed
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

      - name: Set up Python
-       uses: actions/setup-python@v3
+       uses: actions/setup-python@v5
+       with:
+         python-version: "3.11"

      - name: Run pre-commits
        uses: pre-commit/action@v3.0.1
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

      - name: Set up Python
-       uses: actions/setup-python@v3
+       uses: actions/setup-python@v5
+       with:
+         python-version: "3.11"

      - name: Find modified files
        id: file_changes
28 changes: 2 additions & 26 deletions .github/workflows/publish-to-pypi.yml
@@ -12,7 +12,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-         python-version: "3.x"
+         python-version: "3.11"
      - name: Install pypa/build
        run: >-
          python3 -m
@@ -36,7 +36,7 @@ jobs:
    runs-on: ubuntu-latest
    environment:
      name: pypi
-     url: https://pypi.org/p/<package-name> # Replace <package-name> with your PyPI project name
+     url: https://pypi.org/p/meds-tab # Replace <package-name> with your PyPI project name
    permissions:
      id-token: write # IMPORTANT: mandatory for trusted publishing

@@ -91,27 +91,3 @@ jobs:
          gh release upload
          '${{ github.ref_name }}' dist/**
          --repo '${{ github.repository }}'
-
-  publish-to-testpypi:
-    name: Publish Python 🐍 distribution 📦 to TestPyPI
-    needs:
-      - build
-    runs-on: ubuntu-latest
-
-    environment:
-      name: testpypi
-      url: https://test.pypi.org/p/<package-name>
-
-    permissions:
-      id-token: write # IMPORTANT: mandatory for trusted publishing
-
-    steps:
-      - name: Download all the dists
-        uses: actions/download-artifact@v3
-        with:
-          name: python-package-distributions
-          path: dist/
-      - name: Publish distribution 📦 to TestPyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          repository-url: https://test.pypi.org/legacy/
10 changes: 6 additions & 4 deletions .github/workflows/tests.yaml
@@ -12,17 +12,19 @@ jobs:

    strategy:
      fail-fast: false
+     matrix:
+       python-version: ["3.11", "3.12"]

    timeout-minutes: 30

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

-     - name: Set up Python 3.12
-       uses: actions/setup-python@v3
+     - name: Set up Python ${{ matrix.python-version }}
+       uses: actions/setup-python@v5
        with:
-         python-version: "3.12"
+         python-version: ${{ matrix.python-version }}

      - name: Install packages
        run: |
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
@@ -1,7 +1,5 @@
default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
44 changes: 22 additions & 22 deletions README.md
@@ -84,12 +84,12 @@ By following these steps, you can seamlessly transform your dataset, define nece

```console
# Re-shard pipeline
-# $MIMICIV_MEDS_DIR is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data
+# $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data
# $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model
# will store cached files during processing by default.
# $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use.
MEDS_transform-reshard_to_split \
-    input_dir="$MIMICIV_MEDS_DIR" \
+    input_dir="$MIMICIV_input_dir" \
cohort_dir="$MEDS_TAB_COHORT_DIR" \
'stages=["reshard_to_split"]' \
stage="reshard_to_split" \
@@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece
- static codes (codes without timestamps)
- static numerical codes (codes without timestamps but with numerical values).

-This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument.
+This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument.
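As a quick illustration of what this cache enables (not part of this diff), the frequency filter can be previewed straight from the parquet file; the `code`/`count` column names here are assumptions, not something this PR pins down:

```python
# Sketch only: preview which codes would survive min_code_inclusion_frequency=10.
# The file location follows the text above; the column names are assumptions.
import polars as pl

metadata = pl.read_parquet("path_to_data/code_metadata.parquet")
frequent = metadata.filter(pl.col("count") >= 10)
print(frequent.sort("count", descending=True).head())
```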

-2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.
+2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.

**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`

```console
-meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \
+meds-tab-tabularize-static input_dir="path_to_data" \
tabularization.min_code_inclusion_frequency=10 \
tabularization.window_sizes=[1d,30d,365d,full] \
do_overwrite=False \
@@ -119,27 +119,27 @@ By following these steps, you can seamlessly transform your dataset, define nece

- For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24)

-3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
+3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).

**Example: Aggregate time-series data** on features across different `window_sizes`

```console
meds-tab-tabularize-time-series --multirun \
worker="range(0,$N_PARALLEL_WORKERS)" \
hydra/launcher=joblib \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
tabularization.min_code_inclusion_frequency=10 \
do_overwrite=False \
tabularization.window_sizes=[1d,30d,365d,full] \
tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
```

-4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`.
+4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`.

-**Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES)
+**Example: Align tabularized data** for a specific task `$TASK` and labels that have been pulled from [ACES](https://github.com/justin13601/ACES)

```console
-meds-tab-cache-task MEDS_cohort_dir="path_to_data" \
+meds-tab-cache-task input_dir="path_to_data" \
task_name=$TASK \
tabularization.min_code_inclusion_frequency=10 \
do_overwrite=False \
@@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece

```console
meds-tab-xgboost --multirun \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
tabularization.min_code_inclusion_frequency=10 \
@@ -321,7 +321,7 @@ Now that we have generated tabular features for all the events in our dataset, w
- **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process.
- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times.

-The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns.
+The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns.
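For illustration only (not part of this diff), a minimal label shard matching that schema might be built like this; the shard path and values are hypothetical:

```python
# Hypothetical sketch of one task-label shard with the required
# subject_id / timestamp / label columns described above.
from datetime import datetime

import polars as pl

labels = pl.DataFrame(
    {
        "subject_id": [1, 1, 2],
        "timestamp": [
            datetime(2024, 1, 1),
            datetime(2024, 3, 5),
            datetime(2024, 2, 2),
        ],
        "label": [0, 1, 0],  # binary task label
    }
)
# Mirrors the shard structure of the input MEDS data, e.g. train/0.parquet.
labels.write_parquet("task_labels/train/0.parquet")
```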

## 4. XGBoost Training

@@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do

```console
meds-tab-xgboost
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
do_overwrite=False \
@@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`:

```console
meds-tab-xgboost --multirun \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \
@@ -529,14 +529,14 @@ The hydra sweeper swept over the parameters:

```yaml
params:
-  +model_params.model.eta: tag(log, interval(0.001, 1))
-  +model_params.model.lambda: tag(log, interval(0.001, 1))
-  +model_params.model.alpha: tag(log, interval(0.001, 1))
-  +model_params.model.subsample: interval(0.5, 1)
-  +model_params.model.min_child_weight: interval(1e-2, 100)
-  +model_params.model.max_depth: range(2, 16)
-  model_params.num_boost_round: range(100, 1000)
-  model_params.early_stopping_rounds: range(1, 10)
+  model.eta: tag(log, interval(0.001, 1))
+  model.lambda: tag(log, interval(0.001, 1))
+  model.alpha: tag(log, interval(0.001, 1))
+  model.subsample: interval(0.5, 1)
+  model.min_child_weight: interval(1e-2, 100)
+  model.max_depth: range(2, 16)
+  num_boost_round: range(100, 1000)
+  early_stopping_rounds: range(1, 10)
tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
```
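For readers unfamiliar with the sweeper syntax, `tag(log, interval(...))` denotes a log-uniform continuous range and `range(...)` an integer range. Assuming Hydra's Optuna sweeper backend, the draws correspond roughly to the following sketch (not code from this PR):

```python
# Rough Optuna equivalent of the search space above; shown only to
# clarify the interval / range / tag(log, ...) notation.
import optuna


def sample_params(trial: optuna.Trial) -> dict:
    return {
        "model.eta": trial.suggest_float("model.eta", 0.001, 1, log=True),
        "model.lambda": trial.suggest_float("model.lambda", 0.001, 1, log=True),
        "model.alpha": trial.suggest_float("model.alpha", 0.001, 1, log=True),
        "model.subsample": trial.suggest_float("model.subsample", 0.5, 1),
        "model.min_child_weight": trial.suggest_float("model.min_child_weight", 1e-2, 100),
        "model.max_depth": trial.suggest_int("model.max_depth", 2, 16),
        "num_boost_round": trial.suggest_int("num_boost_round", 100, 1000),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 1, 10),
        "tabularization.min_code_inclusion_frequency": trial.suggest_int(
            "tabularization.min_code_inclusion_frequency", 10, 1_000_000, log=True
        ),
    }
```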

2 changes: 1 addition & 1 deletion docs/source/implementation.md
@@ -92,7 +92,7 @@ Now that we have generated tabular features for all the events in our dataset, w
- **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process.
- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times.

-The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard filestructure as the input meds data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns.
+The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard filestructure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns.

## 4. XGBoost Training

16 changes: 8 additions & 8 deletions docs/source/overview.md
@@ -38,14 +38,14 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au
- static codes (codes without timestamps)
- static numerical codes (codes without timestamps but with numerical values).

-This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument.
+This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument.

-2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.
+2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.

**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`

```console
-meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \
+meds-tab-tabularize-static input_dir="path_to_data" \
tabularization.min_code_inclusion_frequency=10 \
tabularization.window_sizes=[1d,30d,365d,full] \
do_overwrite=False \
@@ -54,27 +54,27 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au

- For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24)

-3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
+3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).

**Example: Aggregate time-series data** on features across different `window_sizes`

```console
meds-tab-tabularize-time-series --multirun \
worker="range(0,$N_PARALLEL_WORKERS)" \
hydra/launcher=joblib \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
tabularization.min_code_inclusion_frequency=10 \
do_overwrite=False \
tabularization.window_sizes=[1d,30d,365d,full] \
tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
```

-4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`.
+4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`.

**Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES)

```console
-meds-tab-cache-task MEDS_cohort_dir="path_to_data" \
+meds-tab-cache-task input_dir="path_to_data" \
task_name=$TASK \
tabularization.min_code_inclusion_frequency=10 \
do_overwrite=False \
@@ -86,7 +86,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au

```console
meds-tab-xgboost --multirun \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
tabularization.min_code_inclusion_frequency=10 \
20 changes: 10 additions & 10 deletions docs/source/prediction.md
@@ -14,7 +14,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do

```console
meds-tab-xgboost
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
do_overwrite=False \
@@ -84,7 +84,7 @@ The XGBoost sweep was run using the following command for each `$TASK`:

```console
meds-tab-xgboost --multirun \
-MEDS_cohort_dir="path_to_data" \
+input_dir="path_to_data" \
task_name=$TASK \
output_dir="output_directory" \
tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
@@ -107,14 +107,14 @@ The hydra sweeper swept over the parameters:

```yaml
params:
-  +model_params.model.eta: tag(log, interval(0.001, 1))
-  +model_params.model.lambda: tag(log, interval(0.001, 1))
-  +model_params.model.alpha: tag(log, interval(0.001, 1))
-  +model_params.model.subsample: interval(0.5, 1)
-  +model_params.model.min_child_weight: interval(1e-2, 100)
-  +model_params.model.max_depth: range(2, 16)
-  model_params.num_boost_round: range(100, 1000)
-  model_params.early_stopping_rounds: range(1, 10)
+  model.eta: tag(log, interval(0.001, 1))
+  model.lambda: tag(log, interval(0.001, 1))
+  model.alpha: tag(log, interval(0.001, 1))
+  model.subsample: interval(0.5, 1)
+  model.min_child_weight: interval(1e-2, 100)
+  model.max_depth: range(2, 16)
+  num_boost_round: range(100, 1000)
+  early_stopping_rounds: range(1, 10)
tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
```
