diff --git a/.cruft.json b/.cruft.json
index f618712a..0c9c24d5 100644
--- a/.cruft.json
+++ b/.cruft.json
@@ -1,6 +1,6 @@
 {
   "template": "https://github.com/MartinBernstorff/swift-python-cookiecutter",
-  "commit": "8ecdbe54a1bc87dba0f664995a581c9504b27a33",
+  "commit": "5e55520ebfd3b1269cd85b352cd690905c32f7fa",
   "checkout": null,
   "context": {
     "cookiecutter": {
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1932744f..9613c02d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -25,23 +25,26 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
+      - name: Cache venv
+        uses: actions/cache@v3.2.6
+        id: cache_venv
+        with:
+          path: |
+            .venv
+          key: ${{ runner.os }}-${{ matrix.python-version }}-venv-${{ hashFiles('**/pyproject.toml') }}
+
       - name: Set up Python
         uses: actions/setup-python@v4
         id: setup_python
+        if: steps.cache_venv.outputs.cache-hit != 'true'
         with:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
           cache-dependency-path: "**/pyproject.toml"
 
-      - name: Cache venv
-        uses: actions/cache@v3.2.6
-        with:
-          path: |
-            .venv
-          key: ${{ runner.os }}-${{ steps.setup_python.python-version }}-venv-${{ hashFiles('**/pyproject.toml') }}
-
       - name: Install dependencies
         shell: bash
+        if: steps.cache_venv.outputs.cache-hit != 'true'
         run: |
           python -m venv .venv
           source .venv/bin/activate
diff --git a/src/psycop_model_training/model_eval/base_artifacts/plots/utils.py b/src/psycop_model_training/model_eval/base_artifacts/plots/utils.py
index f2892a6a..262febdb 100644
--- a/src/psycop_model_training/model_eval/base_artifacts/plots/utils.py
+++ b/src/psycop_model_training/model_eval/base_artifacts/plots/utils.py
@@ -93,7 +93,7 @@ def create_performance_by_input(
         },
     )
 
-    # bin data
+    # bin data and calculate metric per bin
     if bin_continuous_input:
         df[f"{input_name}_binned"] = bin_continuous_data(df[input_name], bins=bins)
 
diff --git a/src/psycop_model_training/utils/utils.py b/src/psycop_model_training/utils/utils.py
index 2feda5a5..c5c53688 100644
--- a/src/psycop_model_training/utils/utils.py
+++ b/src/psycop_model_training/utils/utils.py
@@ -215,10 +215,26 @@ def bin_continuous_data(
         else:
             continue
 
-    # Drop any category in the series where the bin has fewer than 5 observations
-    series = series[series.groupby(series).transform("count") >= min_n_in_bin]
+    # Bin every value first so the number of observations per bin can be counted
+    df = pd.DataFrame(
+        {
+            "series": series,
+            "bin": pd.cut(
+                series,
+                bins=bins,
+                labels=labels,
+                duplicates="drop",
+                include_lowest=True,
+            ),
+        },
+    )
+
+    # Rows whose bin holds fewer than min_n_in_bin observations are masked to NaN
+    bins_with_insufficient_n = (
+        df.groupby("bin")["series"].transform("size") < min_n_in_bin
+    )
 
-    return pd.cut(series, bins=bins, labels=labels, duplicates="drop")
+    return df["bin"].mask(bins_with_insufficient_n)
 
 
 def positive_rate_to_pred_probs(
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9b0eabe7..a8181ba5 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 from psycop_model_training.utils.utils import (
+    bin_continuous_data,
     drop_records_if_datediff_days_smaller_than,
     flatten_nested_dict,
 )
@@ -91,3 +92,31 @@ def test_flatten_nested_dict():
     output_dict = flatten_nested_dict(input_dict)
 
     assert expected_dict == output_dict
+
+
+def test_bin_continuous_data():
+    """Bins with fewer than 5 observations are masked to NaN; fuller bins are kept."""
+    one_to_five = pd.Series([1, 2, 3, 4, 5])
+
+    # One bin holding exactly 5 observations: every value keeps its bin
+    one_bin_more_than_five = bin_continuous_data(
+        series=one_to_five,
+        bins=[0, 5],
+    )
+    assert len(one_bin_more_than_five.unique()) == 1
+    assert one_bin_more_than_five.isna().sum() == 0
+
+    # One bin holding fewer than 5 observations: every value is masked
+    one_to_four = pd.Series([1, 2, 3, 4])
+    one_bin_less_than_five = bin_continuous_data(series=one_to_four, bins=[0, 5])
+    assert one_bin_less_than_five.isna().sum() == 4
+
+    # Two bins, each holding fewer than 5 observations: every value is masked
+    two_bins_less_than_five = bin_continuous_data(series=one_to_four, bins=[0, 2, 5])
+    assert two_bins_less_than_five.isna().sum() == 4
+
+    # Two bins holding exactly 5 observations each: both bins are kept
+    one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    two_bins_more_than_five = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11])
+    assert len(two_bins_more_than_five.unique()) == 2
+    assert two_bins_more_than_five.isna().sum() == 0