From 8b1ba870fe7e29202f78fa37867cabfa3cea343e Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Fri, 20 Sep 2024 15:53:01 -0400 Subject: [PATCH 1/3] Create CatMatrix from codes and categories (#389) * first try * changelog * cat is pandas-only * add error message when indices can't be casted to int32. --- CHANGELOG.rst | 5 +++ pixi.lock | 44 ++++++++++++------------- src/tabmat/categorical_matrix.py | 55 ++++++++++++++++++-------------- 3 files changed, 58 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 89c36517..265dd504 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,10 +14,15 @@ Changelog - Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`. +**Other changes:** + +- Allow :class:`CategoricalMatrix` to be initialized directly with indices and categories. + **Bug fix:** - Fixed a bug in :meth:`tabmat.CategoricalMatrix.standardize` that sometimes returned ``nan`` values for the standard deviation due to numerical instability if using ``np.float32`` precision. + 4.0.1 - 2024-06-25 ------------------ diff --git a/pixi.lock b/pixi.lock index 5246adb3..00dc65f4 100644 --- a/pixi.lock +++ b/pixi.lock @@ -368,7 +368,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -1686,7 +1686,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -2099,7 +2099,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -2506,7 +2506,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -2916,7 +2916,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -3329,7 +3329,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -3742,7 +3742,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.40-ha1999f0_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.40-hb3c18ed_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.7.0-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 @@ -5727,6 +5727,21 @@ packages: license_family: BSD size: 122909 timestamp: 1720974522888 +- kind: conda + name: c-ares + version: 1.32.3 + build: h4bc722e_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.32.3-h4bc722e_0.conda + sha256: 3c5a844bb60b0d52d89c3f1bd828c9856417fe33a6102fd8bbd5c13c3351704a + md5: 7624e34ee6baebfc80d67bac76cc9d9d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc-ng >=12 + license: MIT + license_family: MIT + size: 179736 + timestamp: 1721834714515 - kind: conda name: c-ares version: 1.33.1 @@ -5757,21 +5772,6 @@ packages: license_family: MIT size: 159389 timestamp: 1724438175204 -- kind: conda - name: c-ares - version: 1.33.1 - build: heb4867d_0 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.33.1-heb4867d_0.conda - sha256: 2cb24f613eaf2850b1a08f28f967b10d8bd44ef623efa0154dc45eb718776be6 - md5: 0d3c60291342c0c025db231353376dfb - depends: - - __glibc >=2.28,<3.0.a0 - - libgcc-ng >=13 - license: MIT - license_family: MIT - size: 182796 - timestamp: 1724438109690 - kind: conda name: c-compiler version: 1.7.0 diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index a0d0c2fa..26cf0050 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -199,15 +199,6 @@ def matvec(mat, vec): import polars as pl -class _Categorical: - """This class helps us avoid copies while subsetting.""" - - def __init__(self, indices, categories, dtype): - self.indices = indices - self.categories = categories - self.dtype = dtype - - def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): if isinstance(indexer, np.ndarray): if (indexer > full_length - 1).any(): @@ -232,10 +223,7 @@ def _is_polars(x) -> bool: def _extract_codes_and_categories(cat_vec): - if isinstance(cat_vec, _Categorical): - categories = cat_vec.categories - indices = cat_vec.indices - elif _is_pandas(cat_vec): + if _is_pandas(cat_vec): categories = cat_vec.categories.to_numpy() indices = cat_vec.codes elif _is_pandas(cat_vec.dtype): @@ -284,6 +272,9 @@ class CategoricalMatrix(MatrixBase): cat_vec: array-like vector of categorical data. + categories: np.ndarray, default None + If provided, cat_vec is assumed to be an array-like vector of indices. + drop_first: drop the first level of the dummy encoding. This allows a CategoricalMatrix to be used in an unregularized setting. @@ -306,6 +297,7 @@ class CategoricalMatrix(MatrixBase): def __init__( self, cat_vec, + categories: Optional[np.ndarray] = None, drop_first: bool = False, dtype: np.dtype = np.float64, column_name: Optional[str] = None, @@ -321,13 +313,21 @@ def __init__( ) if not hasattr(cat_vec, "dtype"): - cat_vec = np.array(cat_vec) # avoid errors in pd.factorize + cat_vec = np.asarray(cat_vec) # avoid errors in pd.factorize self._input_dtype = cat_vec.dtype self._missing_method = cat_missing_method self._missing_category = cat_missing_name - indices, self.categories = _extract_codes_and_categories(cat_vec) + if categories is not None: + self.categories = categories + indices = np.nan_to_num(cat_vec, nan=-1) + if max(indices) >= len(categories): + raise ValueError("Indices exceed length of categories.") + if min(indices) < -1: + raise ValueError("Indices must be non-negative (or -1 for missing).") + else: + indices, self.categories = _extract_codes_and_categories(cat_vec) if np.any(indices == -1): if self._missing_method == "fail": @@ -357,7 +357,13 @@ def __init__( self._has_missings = False self.drop_first = drop_first - self.indices = indices.astype(np.int32, copy=False) + try: + self.indices = indices.astype(np.int32, copy=False) + except ValueError: + raise ValueError( + "When creating a CategoricalMatrix with indices and categories, " + "indices must be castable to a numpy int32 dtype." + ) self.shape = (len(self.indices), len(self.categories) - int(drop_first)) self.x_csc = None self.dtype = np.dtype(dtype) @@ -382,13 +388,13 @@ def cat(self): "This property will be removed in the next major release.", category=DeprecationWarning, ) - - if _is_polars(self._input_dtype): - out = self.categories[self.indices].astype("object", copy=False) - out = np.where(self.indices < 0, None, out) - return pl.Series(out, dtype=pl.Enum(self.categories)) - - return pd.Categorical.from_codes(self.indices, categories=self.categories) + try: + return pd.Categorical.from_codes(self.indices, categories=self.categories) + except NameError: + raise ModuleNotFoundError( + "The `cat` property is provided for backward compatibility and " + "requires pandas to be installed." + ) def recover_orig(self) -> np.ndarray: """ @@ -683,7 +689,8 @@ def __getitem__(self, item): if isinstance(row, np.ndarray): row = row.ravel() return CategoricalMatrix( - _Categorical(self.indices[row], self.categories, self._input_dtype), + self.indices[row], + categories=self.categories, drop_first=self.drop_first, dtype=self.dtype, column_name=self._colname, From 55990e440985885b549d0c0ffec70e156b397c39 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 09:10:51 +0100 Subject: [PATCH 2/3] Bump pypa/cibuildwheel from 2.21.0 to 2.21.1 (#394) --- .github/workflows/build-wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 5c5bd7e9..790a93d2 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -25,14 +25,14 @@ jobs: uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822 - name: Build wheels if: github.event_name != 'release' - uses: pypa/cibuildwheel@v2.21.0 + uses: pypa/cibuildwheel@v2.21.1 env: CIBW_ARCHS_LINUX: auto CIBW_ARCHS_MACOS: x86_64 arm64 CIBW_TEST_SKIP: '*-macosx_arm64' - name: Build wheels (release) if: github.event_name == 'release' && github.event.action == 'published' - uses: pypa/cibuildwheel@v2.21.0 + uses: pypa/cibuildwheel@v2.21.1 env: CIBW_ARCHS_LINUX: x86_64 aarch64 CIBW_ARCHS_MACOS: x86_64 arm64 From 1e9579abb83221b07636c236850feaf8de1d68ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 09:11:05 +0100 Subject: [PATCH 3/3] Bump pypa/gh-action-pypi-publish from 1.10.1 to 1.10.2 (#395) --- .github/workflows/build-wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 790a93d2..b7498ae3 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -73,7 +73,7 @@ jobs: with: merge-multiple: true path: dist - - uses: pypa/gh-action-pypi-publish@v1.10.1 + - uses: pypa/gh-action-pypi-publish@v1.10.2 with: repository-url: https://test.pypi.org/legacy/ @@ -91,4 +91,4 @@ jobs: with: merge-multiple: true path: dist - - uses: pypa/gh-action-pypi-publish@v1.10.1 + - uses: pypa/gh-action-pypi-publish@v1.10.2