Skip to content

Commit

Permalink
Merge branch 'main' into check-sandwich-dimensions
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart authored Sep 23, 2024
2 parents 0fed479 + 1e9579a commit e221f7f
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 50 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build-wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ jobs:
uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822
- name: Build wheels
if: github.event_name != 'release'
uses: pypa/cibuildwheel@v2.21.0
uses: pypa/cibuildwheel@v2.21.1
env:
CIBW_ARCHS_LINUX: auto
CIBW_ARCHS_MACOS: x86_64 arm64
CIBW_TEST_SKIP: '*-macosx_arm64'
- name: Build wheels (release)
if: github.event_name == 'release' && github.event.action == 'published'
uses: pypa/cibuildwheel@v2.21.0
uses: pypa/cibuildwheel@v2.21.1
env:
CIBW_ARCHS_LINUX: x86_64 aarch64
CIBW_ARCHS_MACOS: x86_64 arm64
Expand Down Expand Up @@ -73,7 +73,7 @@ jobs:
with:
merge-multiple: true
path: dist
- uses: pypa/gh-action-pypi-publish@v1.10.1
- uses: pypa/gh-action-pypi-publish@v1.10.2
with:
repository-url: https://test.pypi.org/legacy/

Expand All @@ -91,4 +91,4 @@ jobs:
with:
merge-multiple: true
path: dist
- uses: pypa/gh-action-pypi-publish@v1.10.1
- uses: pypa/gh-action-pypi-publish@v1.10.2
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ Changelog

**Other changes:**

- Allow :class:`CategoricalMatrix` to be initialized directly with indices and categories.
- Added checks for dimension and ``dtype`` mismatch in :meth:`MatrixBase.sandwich`.

**Bug fix:**

- Fixed a bug in :meth:`tabmat.CategoricalMatrix.standardize` that sometimes returned ``nan`` values for the standard deviation due to numerical instability if using ``np.float32`` precision.


4.0.1 - 2024-06-25
------------------

Expand Down
44 changes: 22 additions & 22 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 31 additions & 24 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,15 +200,6 @@ def matvec(mat, vec):
import polars as pl


class _Categorical:
"""This class helps us avoid copies while subsetting."""

def __init__(self, indices, categories, dtype):
self.indices = indices
self.categories = categories
self.dtype = dtype


def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]):
if isinstance(indexer, np.ndarray):
if (indexer > full_length - 1).any():
Expand All @@ -233,10 +224,7 @@ def _is_polars(x) -> bool:


def _extract_codes_and_categories(cat_vec):
if isinstance(cat_vec, _Categorical):
categories = cat_vec.categories
indices = cat_vec.indices
elif _is_pandas(cat_vec):
if _is_pandas(cat_vec):
categories = cat_vec.categories.to_numpy()
indices = cat_vec.codes
elif _is_pandas(cat_vec.dtype):
Expand Down Expand Up @@ -285,6 +273,9 @@ class CategoricalMatrix(MatrixBase):
cat_vec:
array-like vector of categorical data.
categories: np.ndarray, default None
If provided, cat_vec is assumed to be an array-like vector of indices.
drop_first:
drop the first level of the dummy encoding. This allows a CategoricalMatrix
to be used in an unregularized setting.
Expand All @@ -307,6 +298,7 @@ class CategoricalMatrix(MatrixBase):
def __init__(
self,
cat_vec,
categories: Optional[np.ndarray] = None,
drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
Expand All @@ -322,13 +314,21 @@ def __init__(
)

if not hasattr(cat_vec, "dtype"):
cat_vec = np.array(cat_vec) # avoid errors in pd.factorize
cat_vec = np.asarray(cat_vec) # avoid errors in pd.factorize

self._input_dtype = cat_vec.dtype
self._missing_method = cat_missing_method
self._missing_category = cat_missing_name

indices, self.categories = _extract_codes_and_categories(cat_vec)
if categories is not None:
self.categories = categories
indices = np.nan_to_num(cat_vec, nan=-1)
if max(indices) >= len(categories):
raise ValueError("Indices exceed length of categories.")
if min(indices) < -1:
raise ValueError("Indices must be non-negative (or -1 for missing).")
else:
indices, self.categories = _extract_codes_and_categories(cat_vec)

if np.any(indices == -1):
if self._missing_method == "fail":
Expand Down Expand Up @@ -358,7 +358,13 @@ def __init__(
self._has_missings = False

self.drop_first = drop_first
self.indices = indices.astype(np.int32, copy=False)
try:
self.indices = indices.astype(np.int32, copy=False)
except ValueError:
raise ValueError(
"When creating a CategoricalMatrix with indices and categories, "
"indices must be castable to a numpy int32 dtype."
)
self.shape = (len(self.indices), len(self.categories) - int(drop_first))
self.x_csc = None
self.dtype = np.dtype(dtype)
Expand All @@ -383,13 +389,13 @@ def cat(self):
"This property will be removed in the next major release.",
category=DeprecationWarning,
)

if _is_polars(self._input_dtype):
out = self.categories[self.indices].astype("object", copy=False)
out = np.where(self.indices < 0, None, out)
return pl.Series(out, dtype=pl.Enum(self.categories))

return pd.Categorical.from_codes(self.indices, categories=self.categories)
try:
return pd.Categorical.from_codes(self.indices, categories=self.categories)
except NameError:
raise ModuleNotFoundError(
"The `cat` property is provided for backward compatibility and "
"requires pandas to be installed."
)

def recover_orig(self) -> np.ndarray:
"""
Expand Down Expand Up @@ -685,7 +691,8 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
_Categorical(self.indices[row], self.categories, self._input_dtype),
self.indices[row],
categories=self.categories,
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
Expand Down

0 comments on commit e221f7f

Please sign in to comment.