diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 44c47efd..0940d66a 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -252,7 +252,9 @@ def __init__( dtype: np.dtype = np.float64, ): if pd.isnull(cat_vec).any(): - raise ValueError("Categorical data can't have missing values.") + self.has_missing = True + else: + self.has_missing = False if isinstance(cat_vec, pd.Categorical): self.cat = cat_vec @@ -327,9 +329,15 @@ def matvec( if out is None: out = np.zeros(self.shape[0], dtype=other_m.dtype) - if self.drop_first: + if self.drop_first or self.has_missing: matvec_drop_first( - self.indices, other_m, self.shape[0], cols, self.shape[1], out + self.indices, + other_m, + self.shape[0], + cols, + self.shape[1], + out, + self.drop_first, ) else: matvec(self.indices, other_m, self.shape[0], cols, self.shape[1], out) @@ -394,9 +402,16 @@ def transpose_matvec( if cols is not None: cols = set_up_rows_or_cols(cols, self.shape[1]) - if self.drop_first: + if self.drop_first or self.has_missing: transpose_matvec_drop_first( - self.indices, vec, self.shape[1], vec.dtype, rows, cols, out + self.indices, + vec, + self.shape[1], + vec.dtype, + rows, + cols, + out, + self.drop_first, ) else: transpose_matvec( @@ -430,9 +445,9 @@ def sandwich( """ d = np.asarray(d) rows = set_up_rows_or_cols(rows, self.shape[0]) - if self.drop_first: + if self.drop_first or self.has_missing: res_diag = sandwich_categorical_drop_first( - self.indices, d, rows, d.dtype, self.shape[1] + self.indices, d, rows, d.dtype, self.shape[1], self.drop_first ) else: res_diag = sandwich_categorical( @@ -474,9 +489,9 @@ def getcol(self, i: int) -> sps.csc_matrix: def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" - if self.drop_first: + if self.drop_first or self.has_missing: nnz, indices, indptr = subset_categorical_drop_first( - self.indices, self.shape[1] + self.indices, self.shape[1], self.drop_first ) return sps.csr_matrix( (np.ones(nnz, dtype=int), indices, indptr), shape=self.shape @@ -615,7 +630,7 @@ def multiply(self, other) -> SparseMatrix: f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}." ) - if self.drop_first: + if self.drop_first or self.has_missing: return SparseMatrix( sps.csr_matrix( multiply_drop_first( @@ -623,6 +638,7 @@ def multiply(self, other) -> SparseMatrix: d=np.squeeze(other), ncols=self.shape[1], dtype=other.dtype, + drop_first=self.drop_first, ), shape=self.shape, ) diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp index c40f851c..bbe9aa72 100644 --- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp +++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp @@ -9,6 +9,9 @@ void _transpose_matvec_${dropfirst}( F* other, F* res, Int res_size + % if dropfirst == 'all_rows_drop_first': + , bool drop_first + % endif ) { #pragma omp parallel { @@ -16,8 +19,8 @@ void _transpose_matvec_${dropfirst}( #pragma omp for for (Py_ssize_t i = 0; i < n_rows; i++) { % if dropfirst == 'all_rows_drop_first': - Py_ssize_t col_idx = indices[i] - 1; - if (col_idx != -1) { + Py_ssize_t col_idx = indices[i] - drop_first; + if (col_idx >= 0) { restemp[col_idx] += other[i]; } % else: @@ -54,11 +57,11 @@ void _sandwich_cat_cat( for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) { Int k = rows[k_idx]; Int i = i_indices[k] - i_drop_first; - if (i == -1) { + if (i < 0) { continue; } Int j = j_indices[k] - j_drop_first; - if (j == -1) { + if (j < 0) { continue; } restemp[(Py_ssize_t) i * res_n_col + j] += d[k]; @@ -99,13 +102,15 @@ void _sandwich_cat_dense${order}( // instructions // MAYBE TODO: explore whether swapping the loop order for F-ordered mat_j // is useful. - for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { - Py_ssize_t j = j_cols[j_idx]; - % if order == 'C': - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; - % else: - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; - % endif + if (i >= 0) { + for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { + Py_ssize_t j = j_cols[j_idx]; + % if order == 'C': + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; + % else: + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; + % endif + } } } for (Py_ssize_t i = 0; i < res_size; i++) { diff --git a/src/tabmat/ext/categorical.pyx b/src/tabmat/ext/categorical.pyx index 59ea308a..f28a069f 100644 --- a/src/tabmat/ext/categorical.pyx +++ b/src/tabmat/ext/categorical.pyx @@ -12,7 +12,7 @@ from libcpp cimport bool cdef extern from "cat_split_helpers.cpp": void _transpose_matvec_all_rows[Int, F](Int, Int*, F*, F*, Int) - void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int) + void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int, bool) def transpose_matvec( @@ -69,7 +69,8 @@ def transpose_matvec_drop_first( dtype, rows, cols, - floating[:] out + floating[:] out, + bint drop_first ): cdef int row, row_idx, n_keep_rows, col_idx cdef int n_rows = len(indices) @@ -81,15 +82,15 @@ def transpose_matvec_drop_first( # Case 1: No row or col restrictions if no_row_restrictions and no_col_restrictions: - _transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size) + _transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size, drop_first) # Case 2: row restrictions but no col restrictions elif no_col_restrictions: rows_view = rows n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if col_idx != -1: + col_idx = indices[row] - drop_first + if col_idx >= 0: out[col_idx] += other[row] # Cases 3 and 4: col restrictions else: @@ -97,8 +98,8 @@ def transpose_matvec_drop_first( # Case 3: Col restrictions but no row restrictions if no_row_restrictions: for row_idx in range(n_rows): - col_idx = indices[row_idx] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row_idx] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row_idx] # Case 4: Both col restrictions and row restrictions else: @@ -106,8 +107,8 @@ def transpose_matvec_drop_first( n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row] @@ -151,7 +152,8 @@ def matvec_drop_first( int n_rows, int[:] cols, int n_cols, - floating[:] out_vec + floating[:] out_vec, + bint drop_first ): """See `matvec`. Here we drop the first category of the CategoricalMatrix so the indices refer to the column index + 1. @@ -161,14 +163,14 @@ def matvec_drop_first( if cols is None: for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[i] - drop_first # reference category is always 0. + if col_idx >= 0: out_vec[i] += other[col_idx] else: col_included = get_col_included(cols, n_cols) for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 - if (col_idx != -1) and (col_included[col_idx] == 1): + col_idx = indices[i] - drop_first + if (col_idx >= 0) and (col_included[col_idx] == 1): out_vec[i] += other[col_idx] return @@ -196,7 +198,8 @@ def sandwich_categorical_drop_first( floating[:] d, int[:] rows, dtype, - int n_cols + int n_cols, + bint drop_first ): cdef floating[:] res = np.zeros(n_cols, dtype=dtype) cdef int col_idx, k, k_idx @@ -204,8 +207,8 @@ def sandwich_categorical_drop_first( for k_idx in range(n_rows): k = rows[k_idx] - col_idx = indices[k] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[k] - drop_first # reference category is always 0. + if col_idx >= 0: res[col_idx] += d[k] return np.asarray(res) @@ -215,6 +218,7 @@ def multiply_drop_first( numeric[:] d, int ncols, dtype, + bint drop_first, ): """Multiply a CategoricalMatrix by a vector d. @@ -255,9 +259,9 @@ def multiply_drop_first( for i in range(nrows): vnew_indptr[i] = nonref_cnt - if indices[i] != 0: + if indices[i] >= drop_first: vnew_data[nonref_cnt] = d[i] - vnew_indices[nonref_cnt] = indices[i] - 1 + vnew_indices[nonref_cnt] = indices[i] - drop_first nonref_cnt += 1 vnew_indptr[i+1] = nonref_cnt @@ -268,6 +272,7 @@ def multiply_drop_first( def subset_categorical_drop_first( int[:] indices, int ncols, + bint drop_first ): """Construct the inputs to transform a CategoricalMatrix into a csr_matrix. @@ -299,8 +304,8 @@ def subset_categorical_drop_first( for i in range(nrows): vnew_indptr[i] = nonzero_cnt - if indices[i] != 0: - vnew_indices[nonzero_cnt] = indices[i] - 1 + if indices[i] >= drop_first: + vnew_indices[nonzero_cnt] = indices[i] - drop_first nonzero_cnt += 1 vnew_indptr[i+1] = nonzero_cnt