diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9de08d2f..b3750c98 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Unreleased **New features:** - Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties. +- Add a formula interface for creating tabmat matrices from pandas data frames. See :func:`tabmat.from_formula` for details. **Other changes:** diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index de1ec993..2d183c3d 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -38,6 +38,7 @@ requirements: - {{ pin_compatible('numpy') }} - pandas - scipy + - formulaic>=0.4 test: requires: diff --git a/environment-win.yml b/environment-win.yml index 79e9f4e4..fce22a21 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -5,6 +5,7 @@ channels: dependencies: - libblas>=0=*mkl - pandas + - formulaic>=0.4 # development tools - black diff --git a/environment.yml b/environment.yml index 1f4d34a2..9c425c19 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: - nodefaults dependencies: - pandas + - formulaic>=0.4 # development tools - black diff --git a/pyproject.toml b/pyproject.toml index ec42dcbe..278280a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ default_section = 'THIRDPARTY' [tool.cibuildwheel] skip = [ + "cp36-*", "*-win32", "*-manylinux_i686", "pp*", diff --git a/setup.py b/setup.py index 4c2a36ed..632728ac 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "scipy"], + install_requires=["numpy", "pandas", "scipy", "formulaic>=0.4"], ext_modules=cythonize( ext_modules, annotate=False, diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index 53fb1c55..0a3f8233 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -1,5 +1,5 @@ from .categorical_matrix import CategoricalMatrix -from .constructor import from_csc, from_pandas +from .constructor import from_csc, from_formula, from_pandas from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix @@ -14,6 +14,7 @@ "SplitMatrix", "CategoricalMatrix", "from_csc", + "from_formula", "from_pandas", "as_tabmat", "hstack", diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index d280140a..3426a76b 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,13 +1,20 @@ +import sys import warnings -from typing import List, Optional, Sequence, Tuple, Union +from typing import Any, List, Mapping, Optional, Union import numpy as np import pandas as pd +from formulaic import Formula, ModelSpec +from formulaic.materializers.types import NAAction +from formulaic.parser import DefaultFormulaParser +from formulaic.utils.layered_mapping import LayeredMapping from pandas.api.types import is_numeric_dtype from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts from .dense_matrix import DenseMatrix +from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -179,47 +186,6 @@ def from_pandas( return matrices[0] -def _split_sparse_and_dense_parts( - arg1: sps.csc_matrix, - threshold: float = 0.1, - column_names: Optional[Sequence[Optional[str]]] = None, - term_names: Optional[Sequence[Optional[str]]] = None, -) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: - """ - Split matrix. - - Return the dense and sparse parts of a matrix and the corresponding indices - for each at the provided threshold. - """ - if not isinstance(arg1, sps.csc_matrix): - raise TypeError( - f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," - f"not {type(arg1)}" - ) - if not 0 <= threshold <= 1: - raise ValueError("Threshold must be between 0 and 1.") - densities = np.diff(arg1.indptr) / arg1.shape[0] - dense_indices = np.where(densities > threshold)[0] - sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) - - if column_names is None: - column_names = [None] * arg1.shape[1] - if term_names is None: - term_names = column_names - - X_dense_F = DenseMatrix( - np.asfortranarray(arg1[:, dense_indices].toarray()), - column_names=[column_names[i] for i in dense_indices], - term_names=[term_names[i] for i in dense_indices], - ) - X_sparse = SparseMatrix( - arg1[:, sparse_indices], - column_names=[column_names[i] for i in sparse_indices], - term_names=[term_names[i] for i in sparse_indices], - ) - return X_dense_F, X_sparse, dense_indices, sparse_indices - - def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. @@ -229,3 +195,91 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=N """ dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold) return SplitMatrix([dense, sparse], [dense_idx, sparse_idx]) + + +def from_formula( + formula: Union[str, Formula], + data: pd.DataFrame, + ensure_full_rank: bool = False, + na_action: Union[str, NAAction] = NAAction.IGNORE, + dtype: np.dtype = np.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + interaction_separator: str = ":", + categorical_format: str = "{name}[{category}]", + intercept_name: str = "Intercept", + include_intercept: bool = False, + add_column_for_intercept: bool = True, + context: Optional[Union[int, Mapping[str, Any]]] = 0, +) -> SplitMatrix: + """ + Transform a pandas data frame to a SplitMatrix using a Wilkinson formula. + + Parameters + ---------- + formula: str + A formula accepted by formulaic. + data: pd.DataFrame + pandas data frame to be converted. + ensure_full_rank: bool, default False + If True, ensure that the matrix has full structural rank by categories. + na_action: Union[str, NAAction], default NAAction.IGNORE + How to handle missing values. Can be one of "drop", "ignore", "raise". + dtype: np.dtype, default np.float64 + The dtype of the resulting matrix. + sparse_threshold: float, default 0.1 + The density below which a column is treated as sparse. + cat_threshold: int, default 4 + The number of categories below which a categorical column is one-hot + encoded. This is only checked after interactions have been applied. + interaction_separator: str, default ":" + The separator between the names of interacted variables. + categorical_format: str, default "{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + intercept_name: str, default "Intercept" + The name of the intercept column. + include_intercept: bool, default False + Whether to include an intercept term if the formula does not + include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. + add_column_for_intercept: bool, default = True + Whether to add a column of ones for the intercept, or just + have a term without a corresponding column. For advanced use only. + context: Union[int, Mapping[str, Any]], default 0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If None, the context is taken from the stack frame of the + caller at depth 1. If a dict, it is used as the context directly. + """ + if isinstance(context, int): + if hasattr(sys, "_getframe"): + frame = sys._getframe(context + 1) + context = LayeredMapping(frame.f_locals, frame.f_globals) + else: + context = None + spec = ModelSpec( + formula=Formula( + formula, _parser=DefaultFormulaParser(include_intercept=include_intercept) + ), + ensure_full_rank=ensure_full_rank, + na_action=na_action, + ) + materializer = TabmatMaterializer( + data, + context=context, + interaction_separator=interaction_separator, + categorical_format=categorical_format, + intercept_name=intercept_name, + dtype=dtype, + sparse_threshold=sparse_threshold, + cat_threshold=cat_threshold, + add_column_for_intercept=add_column_for_intercept, + ) + result = materializer.get_model_matrix(spec) + + term_names = np.zeros(len(result.term_names), dtype="object") + for term, indices in result.model_spec.term_indices.items(): + term_names[indices] = str(term) + result.term_names = term_names.tolist() + + return result diff --git a/src/tabmat/constructor_util.py b/src/tabmat/constructor_util.py new file mode 100644 index 00000000..f73ceeb2 --- /dev/null +++ b/src/tabmat/constructor_util.py @@ -0,0 +1,48 @@ +from typing import Optional, Sequence, Tuple + +import numpy as np +import scipy.sparse as sps + +from .dense_matrix import DenseMatrix +from .sparse_matrix import SparseMatrix + + +def _split_sparse_and_dense_parts( + arg1: sps.csc_matrix, + threshold: float = 0.1, + column_names: Optional[Sequence[Optional[str]]] = None, + term_names: Optional[Sequence[Optional[str]]] = None, +) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: + """ + Split matrix. + + Return the dense and sparse parts of a matrix and the corresponding indices + for each at the provided threshold. + """ + if not isinstance(arg1, sps.csc_matrix): + raise TypeError( + f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," + f"not {type(arg1)}" + ) + if not 0 <= threshold <= 1: + raise ValueError("Threshold must be between 0 and 1.") + densities = np.diff(arg1.indptr) / arg1.shape[0] + dense_indices = np.where(densities > threshold)[0] + sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) + + if column_names is None: + column_names = [None] * arg1.shape[1] + if term_names is None: + term_names = column_names + + X_dense_F = DenseMatrix( + np.asfortranarray(arg1[:, dense_indices].toarray()), + column_names=[column_names[i] for i in dense_indices], + term_names=[term_names[i] for i in dense_indices], + ) + X_sparse = SparseMatrix( + arg1[:, sparse_indices], + column_names=[column_names[i] for i in sparse_indices], + term_names=[term_names[i] for i in sparse_indices], + ) + return X_dense_F, X_sparse, dense_indices, sparse_indices diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py new file mode 100644 index 00000000..b97d5617 --- /dev/null +++ b/src/tabmat/formula.py @@ -0,0 +1,722 @@ +import functools +import itertools +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import numpy +import pandas +from formulaic import ModelMatrix, ModelSpec +from formulaic.errors import FactorEncodingError +from formulaic.materializers import FormulaMaterializer +from formulaic.materializers.base import EncodedTermStructure +from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm +from formulaic.parser.types import Term +from formulaic.transforms import stateful_transform +from interface_meta import override +from scipy import sparse as sps + +from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts +from .dense_matrix import DenseMatrix +from .matrix_base import MatrixBase +from .sparse_matrix import SparseMatrix +from .split_matrix import SplitMatrix + + +class TabmatMaterializer(FormulaMaterializer): + """Materializer for pandas input and tabmat output.""" + + REGISTER_NAME = "tabmat" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = "tabmat" + + @override + def _init(self): + self.interaction_separator = self.params.get("interaction_separator", ":") + self.categorical_format = self.params.get( + "categorical_format", "{name}[{category}]" + ) + self.intercept_name = self.params.get("intercept_name", "Intercept") + self.dtype = self.params.get("dtype", numpy.float64) + self.sparse_threshold = self.params.get("sparse_threshold", 0.1) + self.cat_threshold = self.params.get("cat_threshold", 4) + self.add_column_for_intercept = self.params.get( + "add_column_for_intercept", True + ) + + # We can override formulaic's C() function here + self.context["C"] = _C + + @override + def _is_categorical(self, values): + if isinstance(values, (pandas.Series, pandas.Categorical)): + return values.dtype == object or isinstance( + values.dtype, pandas.CategoricalDtype + ) + return super()._is_categorical(values) + + @override + def _check_for_nulls(self, name, values, na_action, drop_rows): + if na_action is NAAction.IGNORE: + return + + if na_action is NAAction.RAISE: + if isinstance(values, pandas.Series) and values.isnull().values.any(): + raise ValueError(f"`{name}` contains null values after evaluation.") + + elif na_action is NAAction.DROP: + if isinstance(values, pandas.Series): + drop_rows.update(numpy.flatnonzero(values.isnull().values)) + + else: + raise ValueError( + f"Do not know how to interpret `na_action` = {repr(na_action)}." + ) + + @override + def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): + series = value * numpy.ones(self.nrows - len(drop_rows)) + return _InteractableDenseVector(series, name=self.intercept_name) + + @override + def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + if isinstance(values, pandas.Series): + values = values.to_numpy().astype(self.dtype) + if (values != 0).mean() <= self.sparse_threshold: + return _InteractableSparseVector(sps.csc_matrix(values[:, numpy.newaxis])) + else: + return _InteractableDenseVector(values) + + @override + def _encode_categorical( + self, values, metadata, encoder_state, spec, drop_rows, reduced_rank=False + ): + # We do not do any encoding here as it is handled by tabmat + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + return encode_contrasts( + values, + reduced_rank=reduced_rank, + _metadata=metadata, + _state=encoder_state, + _spec=spec, + ) + + @override + def _combine_columns(self, cols, spec, drop_rows): + # Special case no columns + if not cols: + values = numpy.empty((self.data.shape[0], 0), dtype=self.dtype) + return DenseMatrix(values) + + # Otherwise, concatenate columns into SplitMatrix + return SplitMatrix( + [ + col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold) + for col in cols + ] + ) + + # Have to override this because of column names + # (and possibly intercept later on) + @override + def _build_model_matrix(self, spec: ModelSpec, drop_rows): + # Step 0: Apply any requested column/term clustering + # This must happen before Step 1 otherwise the greedy rank reduction + # below would result in a different outcome than if the columns had + # always been in the generated order. + terms = self._cluster_terms(spec.formula, cluster_by=spec.cluster_by) + + # Step 1: Determine strategy to maintain structural full-rankness of output matrix + scoped_terms_for_terms = self._get_scoped_terms( + terms, + ensure_full_rank=spec.ensure_full_rank, + ) + + # Step 2: Generate the columns which will be collated into the full matrix + cols = [] + for term, scoped_terms in scoped_terms_for_terms: + scoped_cols = OrderedDict() + for scoped_term in scoped_terms: + if not scoped_term.factors: + if not self.add_column_for_intercept: + continue + scoped_cols[ + "Intercept" + ] = scoped_term.scale * self._encode_constant( + 1, None, {}, spec, drop_rows + ) + else: + scoped_cols.update( + self._get_columns_for_term( + [ + self._encode_evaled_factor( + scoped_factor.factor, + spec, + drop_rows, + reduced_rank=scoped_factor.reduced, + ) + for scoped_factor in scoped_term.factors + ], + spec=spec, + scale=scoped_term.scale, + ) + ) + cols.append((term, scoped_terms, scoped_cols)) + + # Step 3: Populate remaining model spec fields + if spec.structure: + cols = self._enforce_structure(cols, spec, drop_rows) + else: + # for term, scoped_terms, columns in spec.structure: + # expanded_columns = list(itertools.chain(colname_dict[col] for col in columns)) + # expanded_structure.append( + # EncodedTermStructure(term, scoped_terms, expanded_columns) + # ) + + spec = spec.update( + structure=[ + EncodedTermStructure( + term, + [st.copy(without_values=True) for st in scoped_terms], + # This is the only line that is different from the original: + list( + itertools.chain( + *(mat.get_names() for mat in scoped_cols.values()) + ) + ), + ) + for term, scoped_terms, scoped_cols in cols + ], + ) + + # Step 4: Collate factors into one ModelMatrix + return ModelMatrix( + self._combine_columns( + [ + (name, values) + for term, scoped_terms, scoped_cols in cols + for name, values in scoped_cols.items() + ], + spec=spec, + drop_rows=drop_rows, + ), + spec=spec, + ) + + @override + def _get_columns_for_term(self, factors, spec, scale=1): + """Assemble the columns for a model matrix given factors and a scale.""" + out = OrderedDict() + for reverse_product in itertools.product( + *(factor.items() for factor in reversed(factors)) + ): + product = reverse_product[::-1] + out[":".join(p[0] for p in product)] = scale * functools.reduce( + functools.partial(_interact, separator=self.interaction_separator), + ( + p[1].set_name(p[0], name_format=self.categorical_format) + for p in product + ), + ) + return out + + # Again, need a correction to handle categoricals properly + @override + def _enforce_structure( + self, + cols: List[Tuple[Term, List[ScopedTerm], Dict[str, Any]]], + spec, + drop_rows: set, + ): + assert len(cols) == len(spec.structure) + for i, col_spec in enumerate(cols): + scoped_cols = col_spec[2] + target_cols = spec.structure[i][2].copy() + + # Correction for categorical variables: + for name, col in scoped_cols.items(): + if isinstance(col, _InteractableCategoricalVector): + try: + _replace_sequence(target_cols, col.get_names(), name) + except ValueError: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are " + "inconsistent with the specification: generated: " + f"{col.get_names()}, expecting: {target_cols}." + ) + + if len(scoped_cols) > len(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated too many columns compared to " + f"specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + if len(scoped_cols) < len(target_cols): + if len(scoped_cols) == 0: + col = self._encode_constant(0, None, None, spec, drop_rows) + elif len(scoped_cols) == 1: + col = tuple(scoped_cols.values())[0] + else: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated insufficient columns " + "compared to specification: generated {list(scoped_cols)}, " + f"expecting {target_cols}." + ) + scoped_cols = {name: col for name in target_cols} + elif set(scoped_cols) != set(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are inconsistent " + "with specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + + yield col_spec[0], col_spec[1], { + col: scoped_cols[col] for col in target_cols + } + + +class _InteractableVector(ABC): + """Abstract base class for interactable vectors, which are mostly thin + wrappers over numpy arrays, scipy sparse matrices and pandas categoricals. + """ + + name: Optional[str] + + @abstractmethod + def to_tabmat( + self, + dtype: numpy.dtype, + sparse_threshold: float, + cat_threshold: int, + ) -> MatrixBase: + """Convert to an actual tabmat matrix.""" + pass + + @abstractmethod + def get_names(self) -> List[str]: + """Return the names of the columns represented by this vector. + + Returns + ------- + List[str] + The names of the columns represented by this vector. + """ + pass + + @abstractmethod + def set_name(self, name, name_format): + """Set the name of the vector. + + Parameters + ---------- + name : str + The name to set. + name_format : str + The format string to use to format the name. Only used for + categoricals. Has to include the placeholders ``{name}`` + and ``{category}`` + + Returns + ------- + self + A reference to the vector itself. + """ + pass + + +class _InteractableDenseVector(_InteractableVector): + def __init__(self, values: numpy.ndarray, name: Optional[str] = None): + self.values = values + self.name = name + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableDenseVector( + values=self.values * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> Union[SparseMatrix, DenseMatrix]: + if (self.values != 0).mean() > sparse_threshold: + return DenseMatrix(self.values, column_names=[self.name]) + else: + # Columns can become sparser, but not denser through interactions + return SparseMatrix( + sps.csc_matrix(self.values[:, numpy.newaxis]), column_names=[self.name] + ) + + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") + return [self.name] + + def set_name(self, name, name_format=None) -> "_InteractableDenseVector": + self.name = name + return self + + +class _InteractableSparseVector(_InteractableVector): + def __init__(self, values: sps.csc_matrix, name: Optional[str] = None): + self.values = values + self.name = name + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableSparseVector( + values=self.values * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> SparseMatrix: + return SparseMatrix(self.values, column_names=[self.name]) + + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") + return [self.name] + + def set_name(self, name, name_format=None) -> "_InteractableSparseVector": + self.name = name + return self + + +class _InteractableCategoricalVector(_InteractableVector): + def __init__( + self, + codes: numpy.ndarray, + categories: List[str], + multipliers: numpy.ndarray, + name: Optional[str] = None, + ): + # sentinel values for codes: + # -1: missing + # -2: drop + self.codes = codes + self.categories = categories + self.multipliers = multipliers + self.name = name + + @classmethod + def from_categorical( + cls, cat: pandas.Categorical, reduced_rank: bool + ) -> "_InteractableCategoricalVector": + """Create an interactable categorical vector from a pandas categorical.""" + categories = list(cat.categories) + codes = cat.codes.copy().astype(numpy.int64) + if reduced_rank: + codes[codes == 0] = -2 + codes[codes > 0] -= 1 + categories = categories[1:] + return cls( + codes=codes, + categories=categories, + multipliers=numpy.ones(len(cat.codes)), + ) + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableCategoricalVector( + categories=self.categories, + codes=self.codes, + multipliers=self.multipliers * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]: + codes = self.codes.copy() + categories = self.categories.copy() + if -2 in self.codes: + codes[codes >= 0] += 1 + codes[codes == -2] = 0 + categories.insert(0, "__drop__") + drop_first = True + else: + drop_first = False + + cat = pandas.Categorical.from_codes( + codes=codes, + categories=categories, + ordered=False, + ) + + categorical_part = CategoricalMatrix( + cat, + drop_first=drop_first, + dtype=dtype, + column_name=self.name, + column_name_format="{category}", + ) + + if (self.codes == -2).all(): + # All values are dropped + return DenseMatrix(numpy.empty((len(codes), 0), dtype=dtype)) + elif (self.multipliers == 1).all() and len(categories) >= cat_threshold: + return categorical_part + else: + sparse_matrix = sps.csc_matrix( + categorical_part.tocsr().multiply(self.multipliers[:, numpy.newaxis]) + ) + ( + dense_part, + sparse_part, + dense_idx, + sparse_idx, + ) = _split_sparse_and_dense_parts( + sparse_matrix, + sparse_threshold, + column_names=categorical_part.column_names, + ) + return SplitMatrix([dense_part, sparse_part], [dense_idx, sparse_idx]) + + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") + return self.categories + + def set_name( + self, name, name_format="{name}[{category}]" + ) -> "_InteractableCategoricalVector": + if self.name is None: + # Make sure to only format the name once + self.name = name + self.categories = [ + name_format.format(name=name, category=cat) for cat in self.categories + ] + return self + + +def _interact( + left: _InteractableVector, right: _InteractableVector, reverse=False, separator=":" +) -> _InteractableVector: + """Interact two interactable vectors. + + Parameters + ---------- + left : _InteractableVector + The left vector. + right : _InteractableVector + The right vector. + reverse : bool, optional + Whether to reverse the order of the interaction, by default False + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableVector + The interacted vector. + """ + if isinstance(left, _InteractableDenseVector): + if isinstance(right, _InteractableDenseVector): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableDenseVector(left.values * right.values, name=new_name) + + else: + return _interact(right, left, reverse=not reverse, separator=separator) + + if isinstance(left, _InteractableSparseVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableSparseVector( + left.values.multiply(right.values.reshape((-1, 1))), + name=new_name, + ) + + else: + return _interact(right, left, reverse=not reverse, separator=separator) + + if isinstance(left, _InteractableCategoricalVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): + if isinstance(right, _InteractableDenseVector): + right_values = right.values + else: + right_values = right.values.toarray().squeeze() + if not reverse: + new_categories = [ + f"{cat}{separator}{right.name}" for cat in left.categories + ] + new_name = f"{left.name}{separator}{right.name}" + else: + new_categories = [ + f"{right.name}{separator}{cat}" for cat in left.categories + ] + new_name = f"{right.name}{separator}{left.name}" + return _InteractableCategoricalVector( + codes=left.codes, + categories=new_categories, + multipliers=left.multipliers * right_values, + name=new_name, + ) + + elif isinstance(right, _InteractableCategoricalVector): + if not reverse: + return _interact_categoricals(left, right, separator=separator) + else: + return _interact_categoricals(right, left, separator=separator) + + raise TypeError( + f"Cannot interact {type(left).__name__} with {type(right).__name__}" + ) + + +def _interact_categoricals( + left: _InteractableCategoricalVector, + right: _InteractableCategoricalVector, + separator=":", +) -> _InteractableCategoricalVector: + """Interact two categorical vectors. + + Parameters + ---------- + left : _InteractableCategoricalVector + The left categorical vector. + right : _InteractableCategoricalVector + The right categorical vector. + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableCategoricalVector + The interacted categorical vector. + """ + cardinality_left = len(left.categories) + new_codes = right.codes * cardinality_left + left.codes + + na_mask = (left.codes == -1) | (right.codes == -1) + drop_mask = (left.codes == -2) | (right.codes == -2) + + new_codes[na_mask] = -1 + new_codes[drop_mask] = -2 + + new_categories = [ + f"{left_cat}{separator}{right_cat}" + for right_cat, left_cat in itertools.product(right.categories, left.categories) + ] + + return _InteractableCategoricalVector( + codes=new_codes, + categories=new_categories, + multipliers=left.multipliers * right.multipliers, + name=f"{left.name}{separator}{right.name}", + ) + + +def _C( + data, + *, + levels: Optional[Iterable[str]] = None, + spans_intercept: bool = True, +): + """ + Mark data as categorical. + + A reduced-functionality version of the ``formulaic`` ``C()`` function. It does not + support custom contrasts or the level argument, but it allows setting + ``spans_intercept=False`` to avoid dropping categories. + """ + + def encoder( + values: Any, + reduced_rank: bool, + drop_rows: List[int], + encoder_state: Dict[str, Any], + model_spec: ModelSpec, + ): + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + return encode_contrasts( + values, + levels=levels, + reduced_rank=reduced_rank, + _state=encoder_state, + _spec=model_spec, + ) + + return FactorValues( + data, + kind="categorical", + spans_intercept=spans_intercept, + encoder=encoder, + ) + + +@stateful_transform +def encode_contrasts( + data, + *, + levels: Optional[Iterable[str]] = None, + reduced_rank: bool = False, + _state=None, + _spec=None, +) -> FactorValues[_InteractableCategoricalVector]: + """ + Encode a categorical dataset into one an _InteractableCategoricalVector + + Parameters + ---------- + data: The categorical data array/series to be encoded. + levels: The complete set of levels (categories) posited to be present in + the data. This can also be used to reorder the levels as needed. + reduced_rank: Whether to reduce the rank of output encoded columns in + order to avoid spanning the intercept. + """ + levels = levels if levels is not None else _state.get("categories") + cat = pandas.Categorical(data._values, categories=levels) + _state["categories"] = cat.categories + return _InteractableCategoricalVector.from_categorical( + cat, reduced_rank=reduced_rank + ) + + +def _replace_sequence(lst: List[str], sequence: List[str], replacement: "str") -> None: + """Replace a sequence of elements in a list with a single element. + + Raises a ValueError if the sequence is not in the list in the correct order. + Only checks for the first possible start of the sequence. + + Parameters + ---------- + lst : List[str] + The list to replace elements in. + sequence : List[str] + The sequence of elements to replace. + replacement : str + The element to replace the sequence with. + """ + try: + start = lst.index(sequence[0]) + except ValueError: + start = 0 # Will handle this below + + for elem in sequence: + if lst[start] != elem: + raise ValueError("The sequence is not in the list") + del lst[start] + + lst.insert(start, replacement) diff --git a/tests/test_formula.py b/tests/test_formula.py new file mode 100644 index 00000000..88a55c71 --- /dev/null +++ b/tests/test_formula.py @@ -0,0 +1,952 @@ +import pickle +import re +from io import BytesIO + +import formulaic +import numpy as np +import pandas as pd +import pytest +from formulaic.materializers import FormulaMaterializer +from formulaic.materializers.types import EvaluatedFactor, FactorValues +from formulaic.parser.types import Factor +from scipy import sparse as sps + +import tabmat as tm +from tabmat.formula import ( + TabmatMaterializer, + _interact, + _InteractableCategoricalVector, + _InteractableDenseVector, + _InteractableSparseVector, +) + + +@pytest.fixture +def df(): + df = pd.DataFrame( + { + "num_1": [1.0, 2.0, 3.0, 4.0, 5.0], + "num_2": [5.0, 4.0, 3.0, 2.0, 1.0], + "cat_1": pd.Categorical(["a", "b", "c", "b", "a"]), + "cat_2": pd.Categorical(["x", "y", "z", "x", "y"]), + "cat_3": pd.Categorical(["1", "2", "1", "2", "1"]), + "str_1": ["a", "b", "c", "b", "a"], + } + ) + return df + + +def test_retrieval(): + assert FormulaMaterializer.for_materializer("tabmat") is TabmatMaterializer + assert ( + FormulaMaterializer.for_data(pd.DataFrame(), output="tabmat") + is TabmatMaterializer + ) + + +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "1 + num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "1 + cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + [ + "__drop__", + "cat_1[b]", + "cat_1[c]", + "cat_1[b]", + "__drop__", + ], + categories=["__drop__", "cat_1[b]", "cat_1[c]"], + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "{np.where(num_1 >= 2, num_1, 0)} * {np.where(num_2 <= 2, num_2, 0)}", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[0.0, 2.0, 3.0, 4.0, 5.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + ] + ) + ).T + ), + ] + ), + id="numeric_sparse", + ), + pytest.param( + "1 + num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + [ + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[c]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[a]:cat_3[1]", + ], + categories=[ + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[1]", + "cat_1[c]:cat_3[1]", + "cat_1[a]:cat_3[2]", + "cat_1[c]:cat_3[2]", + "cat_1[b]:cat_3[2]", + ], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation(df, formula, expected): + model_df = tm.from_formula( + formula, df, ensure_full_rank=True, cat_threshold=1, sparse_threshold=0.5 + ) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "1 + num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "1 + cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + [ + "__drop__", + "cat_1__b", + "cat_1__c", + "cat_1__b", + "__drop__", + ], + categories=["__drop__", "cat_1__b", "cat_1__c"], + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "1 + num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + [ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__c__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__a__x__cat_3__1", + ], + categories=[ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__1", + "cat_1__c__x__cat_3__1", + "cat_1__a__x__cat_3__2", + "cat_1__c__x__cat_3__2", + "cat_1__b__x__cat_3__2", + ], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation_qcl(df, formula, expected): + model_df = tm.from_formula( + formula, + df, + cat_threshold=1, + sparse_threshold=0.5, + ensure_full_rank=True, + interaction_separator="__x__", + categorical_format="{name}__{category}", + intercept_name="intercept", + ) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("num_1 + num_2", id="numeric"), + pytest.param("cat_1 + cat_2", id="categorical"), + pytest.param("cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("bs(num_1, 3)", id="spline"), + pytest.param( + "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + pytest.param( + "C(num_1)", + id="convert_to_categorical", + ), + pytest.param( + "C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + ), + pytest.param("str_1", id="string_as_categorical"), + ], +) +def test_matrix_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, df, ensure_full_rank=ensure_full_rank, include_intercept=True + ) + np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param( + "1 + num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric" + ), + pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), + pytest.param( + "1 + cat_1", ("Intercept", "cat_1[b]", "cat_1[c]"), id="categorical" + ), + pytest.param( + "1 + cat_2 * cat_3", + ( + "Intercept", + "cat_2[y]", + "cat_2[z]", + "cat_3[2]", + "cat_2[y]:cat_3[2]", + "cat_2[z]:cat_3[2]", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("Intercept", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula(formula, df, ensure_full_rank=True) + assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param( + "1 + cat_1", ("intercept", "cat_1__b", "cat_1__c"), id="categorical" + ), + pytest.param( + "1 + cat_2 * cat_3", + ( + "intercept", + "cat_2__y", + "cat_2__z", + "cat_3__2", + "cat_2__y__x__cat_3__2", + "cat_2__z__x__cat_3__2", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("intercept", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_names_against_expectation_qcl(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + categorical_format="{name}__{category}", + interaction_separator="__x__", + intercept_name="intercept", + ) + assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param("1 + cat_1", ("1", "cat_1", "cat_1"), id="categorical"), + pytest.param( + "1 + cat_2 * cat_3", + ( + "1", + "cat_2", + "cat_2", + "cat_3", + "cat_2:cat_3", + "cat_2:cat_3", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)", "poly(num_1, 3)", "poly(num_1, 3)"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("1", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_term_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + intercept_name="intercept", + ) + assert model_tabmat.term_names == list(expected_names) + + +@pytest.mark.parametrize( + "categorical_format", + ["{name}[{category}]", "{name}__{category}"], + ids=["brackets", "double_underscore"], +) +def test_all_names_against_from_pandas(df, categorical_format): + mat_from_pandas = tm.from_pandas( + df, drop_first=False, object_as_cat=True, categorical_format=categorical_format + ) + mat_from_formula = tm.from_formula( + "num_1 + num_2 + cat_1 + cat_2 + cat_3 + str_1 - 1", + data=df, + ensure_full_rank=False, + categorical_format=categorical_format, + ) + + assert mat_from_formula.column_names == mat_from_pandas.column_names + assert mat_from_formula.term_names == mat_from_pandas.term_names + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("1 + num_1 + num_2", id="numeric"), + pytest.param("1 + cat_1 + cat_2", id="categorical"), + pytest.param("1 + cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("1 + num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("1 + {np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("1 + {num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("1 + bs(num_1, 3)", id="spline"), + pytest.param( + "1 + poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + pytest.param( + "1 + C(num_1)", + id="convert_to_categorical", + ), + pytest.param( + "1 + C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + ), + ], +) +def test_names_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=ensure_full_rank, + categorical_format="{name}[T.{category}]", + ) + assert model_tabmat.model_spec.column_names == model_df.model_spec.column_names + assert model_tabmat.model_spec.column_names == tuple(model_df.columns) + assert model_tabmat.column_names == list(model_df.columns) + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula, formula_with_intercept, formula_wo_intercept", + [ + ("num_1", "1 + num_1", "num_1 - 1"), + ("cat_1", "1 + cat_1", "cat_1 - 1"), + ( + "num_1 * cat_1 * cat_2", + "1 + num_1 * cat_1 * cat_2", + "num_1 * cat_1 * cat_2 - 1", + ), + ], + ids=["numeric", "categorical", "mixed"], +) +def test_include_intercept( + df, formula, formula_with_intercept, formula_wo_intercept, ensure_full_rank +): + model_no_include = tm.from_formula( + formula, df, include_intercept=False, ensure_full_rank=ensure_full_rank + ) + model_no_intercept = tm.from_formula( + formula_wo_intercept, + df, + include_intercept=True, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + model_include = tm.from_formula( + formula, df, include_intercept=True, ensure_full_rank=ensure_full_rank + ) + model_intercept = tm.from_formula( + formula_with_intercept, + df, + include_intercept=False, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_include.A, model_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("str_1 : cat_1", id="implicit"), + pytest.param("C(str_1) : C(cat_1, spans_intercept=False)", id="explicit"), + ], +) +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +def test_C_state(df, formula, ensure_full_rank): + model_tabmat = tm.from_formula( + "str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank + ) + model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2]) + np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A) + np.testing.assert_array_equal( + model_tabmat.matrices[1].cat.categories, + model_tabmat_2.matrices[1].cat.categories, + ) + + +VECTORS = [ + _InteractableDenseVector(np.array([1, 2, 3, 4, 5], dtype=np.float64)).set_name( + "dense" + ), + _InteractableSparseVector( + sps.csc_matrix(np.array([[1, 0, 0, 0, 2]], dtype=np.float64).T) + ).set_name("sparse"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=True + ).set_name("cat_reduced"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=False + ).set_name("cat_full"), +] + + +@pytest.mark.parametrize( + "left", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize( + "right", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"]) +def test_interactable_vectors(left, right, reverse): + n = left.to_tabmat().shape[0] + left_np = left.to_tabmat().A.reshape((n, -1)) + right_np = right.to_tabmat().A.reshape((n, -1)) + + if reverse: + left_np, right_np = right_np, left_np + + if isinstance(left, _InteractableCategoricalVector) and isinstance( + right, _InteractableCategoricalVector + ): + result_np = np.zeros((n, left_np.shape[1] * right_np.shape[1])) + for i in range(right_np.shape[1]): + for j in range(left_np.shape[1]): + result_np[:, i * left_np.shape[1] + j] = left_np[:, j] * right_np[:, i] + else: + result_np = left_np * right_np + + result_vec = _interact(left, right, reverse=reverse) + + # Test types + if isinstance(left, _InteractableCategoricalVector) or isinstance( + right, _InteractableCategoricalVector + ): + assert isinstance(result_vec, _InteractableCategoricalVector) + elif isinstance(left, _InteractableSparseVector) or isinstance( + right, _InteractableSparseVector + ): + assert isinstance(result_vec, _InteractableSparseVector) + else: + assert isinstance(result_vec, _InteractableDenseVector) + + # Test values + np.testing.assert_array_equal( + result_vec.to_tabmat().A.squeeze(), result_np.squeeze() + ) + + # Test names + if not reverse: + assert result_vec.name == left.name + ":" + right.name + else: + assert result_vec.name == right.name + ":" + left.name + + +# Tests from formulaic's test suite +# --------------------------------- + +FORMULAIC_TESTS = { + # '': (, , , ) + "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), + "A": ( + ["Intercept", "A[b]", "A[c]"], + ["Intercept", "A[a]", "A[b]", "A[c]"], + ["Intercept", "A[c]"], + 2, + ), + "C(A)": ( + ["Intercept", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[a]", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[c]"], + 2, + ), + "A:a": ( + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a"], + 1, + ), + "A:B": ( + [ + "Intercept", + "B[b]", + "B[c]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[b]:B[c]", + "A[c]:B[c]", + ], + [ + "Intercept", + "A[a]:B[a]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[a]:B[b]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[a]:B[c]", + "A[b]:B[c]", + "A[c]:B[c]", + ], + ["Intercept"], + 1, + ), +} + + +class TestFormulaicTests: + @pytest.fixture + def data(self): + return pd.DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "A": ["a", "b", "c"], "B": ["a", "b", "c"]} + ) + + @pytest.fixture + def data_with_nulls(self): + return pd.DataFrame( + {"a": [1, 2, None], "A": ["a", None, "c"], "B": ["a", "b", None]} + ) + + @pytest.fixture + def materializer(self, data): + return TabmatMaterializer(data) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_get_model_matrix(self, materializer, formula, tests): + mm = materializer.get_model_matrix(formula, ensure_full_rank=True) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0])) + assert list(mm.model_spec.column_names) == tests[0] + + mm = materializer.get_model_matrix(formula, ensure_full_rank=False) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[1])) + assert list(mm.model_spec.column_names) == tests[1] + + def test_get_model_matrix_edge_cases(self, materializer): + mm = materializer.get_model_matrix(("a",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], tm.MatrixBase) + + mm = materializer.get_model_matrix("a ~ A", ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert "lhs" in mm.model_spec + assert "rhs" in mm.model_spec + + mm = materializer.get_model_matrix(("a ~ A",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], formulaic.ModelMatrices) + + def test_get_model_matrix_invalid_output(self, materializer): + with pytest.raises( + formulaic.errors.FormulaMaterializationError, + match=r"Nominated output .* is invalid\. Available output types are: ", + ): + materializer.get_model_matrix( + "a", ensure_full_rank=True, output="invalid_output" + ) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_na_handling(self, data_with_nulls, formula, tests): + mm = TabmatMaterializer(data_with_nulls).get_model_matrix(formula) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (tests[3], len(tests[2])) + assert list(mm.model_spec.column_names) == tests[2] + + if formula != "a": + pytest.skip("Tabmat does not allo NAs in categoricals") + + mm = TabmatMaterializer(data_with_nulls).get_model_matrix( + formula, na_action="ignore" + ) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) + + def test_state(self, materializer): + mm = materializer.get_model_matrix("center(a) - 1") + assert isinstance(mm, tm.MatrixBase) + assert list(mm.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm.getcol(0).unpack().squeeze(), [-1, 0, 1]) + + mm2 = TabmatMaterializer(pd.DataFrame({"a": [4, 5, 6]})).get_model_matrix( + mm.model_spec + ) + assert isinstance(mm2, tm.MatrixBase) + assert list(mm2.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm2.getcol(0).unpack().squeeze(), [2, 3, 4]) + + mm3 = mm.model_spec.get_model_matrix(pd.DataFrame({"a": [4, 5, 6]})) + assert isinstance(mm3, tm.MatrixBase) + assert list(mm3.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm3.getcol(0).unpack().squeeze(), [2, 3, 4]) + + def test_factor_evaluation_edge_cases(self, materializer): + # Test that categorical kinds are set if type would otherwise be numerical + ev_factor = materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="categorical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + assert ev_factor.metadata.kind.value == "categorical" + + # Test that other kind mismatches result in an exception + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "Factor `A` is expecting values of kind 'numerical', " + "but they are actually of kind 'categorical'." + ), + ): + materializer._evaluate_factor( + Factor("A", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + + # Test that if an encoding has already been determined, that an exception is raised + # if the new encoding does not match + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "The model specification expects factor `a` to have values of kind " + "`categorical`, but they are actually of kind `numerical`." + ), + ): + materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec( + formula=[], encoder_state={"a": ("categorical", {})} + ), + drop_rows=set(), + ) + + def test__is_categorical(self, materializer): + assert materializer._is_categorical([1, 2, 3]) is False + assert materializer._is_categorical(pd.Series(["a", "b", "c"])) is True + assert materializer._is_categorical(pd.Categorical(["a", "b", "c"])) is True + assert materializer._is_categorical(FactorValues({}, kind="categorical")) + + def test_encoding_edge_cases(self, materializer): + # Verify that constant encoding works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("10", eval_method="literal", kind="constant"), + values=FactorValues(10, kind="constant"), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["10"].values, [10, 10, 10]) + + # Verify that unencoded dictionaries with drop-fields work + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + np.testing.assert_array_equal(encoded_factor["a[a]"].values, [1, 2, 3]) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + reduced_rank=True, + ) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + # Verify that encoding of nested dictionaries works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("A", eval_method="python", kind="numerical"), + values=FactorValues( + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([4, 5, 6]), + "__metadata__": None, + }, + kind="numerical", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["A[a]"].values, [1, 2, 3]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("B", eval_method="python", kind="categorical"), + values=FactorValues( + {"a": pd.Series(["a", "b", "c"])}, kind="categorical" + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + encoded_matrix = ( + encoded_factor["B[a]"].set_name("B[a]").to_tabmat(cat_threshold=1) + ) + assert list(encoded_matrix.cat) == ["B[a][a]", "B[a][b]", "B[a][c]"] + + def test_empty(self, materializer): + mm = materializer.get_model_matrix("0", ensure_full_rank=True) + assert mm.shape[1] == 0 + mm = materializer.get_model_matrix("0", ensure_full_rank=False) + assert mm.shape[1] == 0 + + def test_category_reordering(self): + data = pd.DataFrame({"A": ["a", "b", "c"]}) + data2 = pd.DataFrame({"A": ["c", "b", "a"]}) + data3 = pd.DataFrame( + {"A": pd.Categorical(["c", "b", "a"], categories=["c", "b", "a"])} + ) + + m = TabmatMaterializer(data).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] + + m2 = TabmatMaterializer(data2).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m2.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] + + m3 = TabmatMaterializer(data3).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m3.model_spec.column_names) == ["A[c]", "A[b]", "A[a]"] + + def test_term_clustering(self, materializer): + assert materializer.get_model_matrix( + "a + b + a:A + b:A" + ).model_spec.column_names == ( + "Intercept", + "a", + "b", + "a:A[b]", + "a:A[c]", + "b:A[b]", + "b:A[c]", + ) + assert materializer.get_model_matrix( + "a + b + a:A + b:A", cluster_by="numerical_factors" + ).model_spec.column_names == ( + "Intercept", + "a", + "a:A[b]", + "a:A[c]", + "b", + "b:A[b]", + "b:A[c]", + ) + + def test_model_spec_pickleable(self, materializer): + o = BytesIO() + ms = materializer.get_model_matrix("a ~ a:A") + pickle.dump(ms.model_spec, o) + o.seek(0) + ms2 = pickle.load(o) + assert isinstance(ms, formulaic.parser.types.Structured) + assert ms2.lhs.formula.root == ["a"] diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index bb36303c..5de01638 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -7,7 +7,7 @@ import tabmat as tm from tabmat import from_pandas -from tabmat.constructor import _split_sparse_and_dense_parts +from tabmat.constructor_util import _split_sparse_and_dense_parts from tabmat.dense_matrix import DenseMatrix from tabmat.ext.sparse import csr_dense_sandwich from tabmat.split_matrix import SplitMatrix