Skip to content

Commit

Permalink
Add missing support to categoricals
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart committed Jul 17, 2023
1 parent 212a1c6 commit f97eb34
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 42 deletions.
36 changes: 26 additions & 10 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,9 @@ def __init__(
dtype: np.dtype = np.float64,
):
if pd.isnull(cat_vec).any():
raise ValueError("Categorical data can't have missing values.")
self.has_missing = True
else:
self.has_missing = False

if isinstance(cat_vec, pd.Categorical):
self.cat = cat_vec
Expand Down Expand Up @@ -327,9 +329,15 @@ def matvec(
if out is None:
out = np.zeros(self.shape[0], dtype=other_m.dtype)

if self.drop_first:
if self.drop_first or self.has_missing:
matvec_drop_first(
self.indices, other_m, self.shape[0], cols, self.shape[1], out
self.indices,
other_m,
self.shape[0],
cols,
self.shape[1],
out,
self.drop_first,
)
else:
matvec(self.indices, other_m, self.shape[0], cols, self.shape[1], out)
Expand Down Expand Up @@ -394,9 +402,16 @@ def transpose_matvec(
if cols is not None:
cols = set_up_rows_or_cols(cols, self.shape[1])

if self.drop_first:
if self.drop_first or self.has_missing:
transpose_matvec_drop_first(
self.indices, vec, self.shape[1], vec.dtype, rows, cols, out
self.indices,
vec,
self.shape[1],
vec.dtype,
rows,
cols,
out,
self.drop_first,
)
else:
transpose_matvec(
Expand Down Expand Up @@ -430,9 +445,9 @@ def sandwich(
"""
d = np.asarray(d)
rows = set_up_rows_or_cols(rows, self.shape[0])
if self.drop_first:
if self.drop_first or self.has_missing:
res_diag = sandwich_categorical_drop_first(
self.indices, d, rows, d.dtype, self.shape[1]
self.indices, d, rows, d.dtype, self.shape[1], self.drop_first
)
else:
res_diag = sandwich_categorical(
Expand Down Expand Up @@ -474,9 +489,9 @@ def getcol(self, i: int) -> sps.csc_matrix:

def tocsr(self) -> sps.csr_matrix:
"""Return scipy csr representation of matrix."""
if self.drop_first:
if self.drop_first or self.has_missing:
nnz, indices, indptr = subset_categorical_drop_first(
self.indices, self.shape[1]
self.indices, self.shape[1], self.drop_first
)
return sps.csr_matrix(
(np.ones(nnz, dtype=int), indices, indptr), shape=self.shape
Expand Down Expand Up @@ -615,14 +630,15 @@ def multiply(self, other) -> SparseMatrix:
f"Shapes do not match. Expected length of {self.shape[0]}. Got {len(other)}."
)

if self.drop_first:
if self.drop_first or self.has_missing:
return SparseMatrix(
sps.csr_matrix(
multiply_drop_first(
indices=self.indices,
d=np.squeeze(other),
ncols=self.shape[1],
dtype=other.dtype,
drop_first=self.drop_first,
),
shape=self.shape,
)
Expand Down
27 changes: 16 additions & 11 deletions src/tabmat/ext/cat_split_helpers-tmpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@ void _transpose_matvec_${dropfirst}(
F* other,
F* res,
Int res_size
% if dropfirst == 'all_rows_drop_first':
, bool drop_first
% endif
) {
#pragma omp parallel
{
std::vector<F> restemp(res_size, 0.0);
#pragma omp for
for (Py_ssize_t i = 0; i < n_rows; i++) {
% if dropfirst == 'all_rows_drop_first':
Py_ssize_t col_idx = indices[i] - 1;
if (col_idx != -1) {
Py_ssize_t col_idx = indices[i] - drop_first;
if (col_idx >= 0) {
restemp[col_idx] += other[i];
}
% else:
Expand Down Expand Up @@ -54,11 +57,11 @@ void _sandwich_cat_cat(
for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) {
Int k = rows[k_idx];
Int i = i_indices[k] - i_drop_first;
if (i == -1) {
if (i < 0) {
continue;
}
Int j = j_indices[k] - j_drop_first;
if (j == -1) {
if (j < 0) {
continue;
}
restemp[(Py_ssize_t) i * res_n_col + j] += d[k];
Expand Down Expand Up @@ -99,13 +102,15 @@ void _sandwich_cat_dense${order}(
// instructions
// MAYBE TODO: explore whether swapping the loop order for F-ordered mat_j
// is useful.
for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) {
Py_ssize_t j = j_cols[j_idx];
% if order == 'C':
restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j];
% else:
restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k];
% endif
if (i >= 0) {
for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) {
Py_ssize_t j = j_cols[j_idx];
% if order == 'C':
restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j];
% else:
restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k];
% endif
}
}
}
for (Py_ssize_t i = 0; i < res_size; i++) {
Expand Down
47 changes: 26 additions & 21 deletions src/tabmat/ext/categorical.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ from libcpp cimport bool

cdef extern from "cat_split_helpers.cpp":
void _transpose_matvec_all_rows[Int, F](Int, Int*, F*, F*, Int)
void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int)
void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int, bool)


def transpose_matvec(
Expand Down Expand Up @@ -69,7 +69,8 @@ def transpose_matvec_drop_first(
dtype,
rows,
cols,
floating[:] out
floating[:] out,
bint drop_first
):
cdef int row, row_idx, n_keep_rows, col_idx
cdef int n_rows = len(indices)
Expand All @@ -81,33 +82,33 @@ def transpose_matvec_drop_first(

# Case 1: No row or col restrictions
if no_row_restrictions and no_col_restrictions:
_transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size)
_transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size, drop_first)
# Case 2: row restrictions but no col restrictions
elif no_col_restrictions:
rows_view = rows
n_keep_rows = len(rows_view)
for row_idx in range(n_keep_rows):
row = rows_view[row_idx]
col_idx = indices[row] - 1
if col_idx != -1:
col_idx = indices[row] - drop_first
if col_idx >= 0:
out[col_idx] += other[row]
# Cases 3 and 4: col restrictions
else:
cols_included = get_col_included(cols, n_cols)
# Case 3: Col restrictions but no row restrictions
if no_row_restrictions:
for row_idx in range(n_rows):
col_idx = indices[row_idx] - 1
if (col_idx != -1) and (cols_included[col_idx]):
col_idx = indices[row_idx] - drop_first
if (col_idx >= 0) and (cols_included[col_idx]):
out[col_idx] += other[row_idx]
# Case 4: Both col restrictions and row restrictions
else:
rows_view = rows
n_keep_rows = len(rows_view)
for row_idx in range(n_keep_rows):
row = rows_view[row_idx]
col_idx = indices[row] - 1
if (col_idx != -1) and (cols_included[col_idx]):
col_idx = indices[row] - drop_first
if (col_idx >= 0) and (cols_included[col_idx]):
out[col_idx] += other[row]


Expand Down Expand Up @@ -151,7 +152,8 @@ def matvec_drop_first(
int n_rows,
int[:] cols,
int n_cols,
floating[:] out_vec
floating[:] out_vec,
bint drop_first
):
"""See `matvec`. Here we drop the first category of the
CategoricalMatrix so the indices refer to the column index + 1.
Expand All @@ -161,14 +163,14 @@ def matvec_drop_first(

if cols is None:
for i in prange(n_rows, nogil=True):
col_idx = indices[i] - 1 # reference category is always 0.
if col_idx != -1:
col_idx = indices[i] - drop_first # reference category is always 0.
if col_idx >= 0:
out_vec[i] += other[col_idx]
else:
col_included = get_col_included(cols, n_cols)
for i in prange(n_rows, nogil=True):
col_idx = indices[i] - 1
if (col_idx != -1) and (col_included[col_idx] == 1):
col_idx = indices[i] - drop_first
if (col_idx >= 0) and (col_included[col_idx] == 1):
out_vec[i] += other[col_idx]
return

Expand Down Expand Up @@ -196,16 +198,17 @@ def sandwich_categorical_drop_first(
floating[:] d,
int[:] rows,
dtype,
int n_cols
int n_cols,
bint drop_first
):
cdef floating[:] res = np.zeros(n_cols, dtype=dtype)
cdef int col_idx, k, k_idx
cdef int n_rows = len(rows)

for k_idx in range(n_rows):
k = rows[k_idx]
col_idx = indices[k] - 1 # reference category is always 0.
if col_idx != -1:
col_idx = indices[k] - drop_first # reference category is always 0.
if col_idx >= 0:
res[col_idx] += d[k]
return np.asarray(res)

Expand All @@ -215,6 +218,7 @@ def multiply_drop_first(
numeric[:] d,
int ncols,
dtype,
bint drop_first,
):
"""Multiply a CategoricalMatrix by a vector d.
Expand Down Expand Up @@ -255,9 +259,9 @@ def multiply_drop_first(

for i in range(nrows):
vnew_indptr[i] = nonref_cnt
if indices[i] != 0:
if indices[i] >= drop_first:
vnew_data[nonref_cnt] = d[i]
vnew_indices[nonref_cnt] = indices[i] - 1
vnew_indices[nonref_cnt] = indices[i] - drop_first
nonref_cnt += 1

vnew_indptr[i+1] = nonref_cnt
Expand All @@ -268,6 +272,7 @@ def multiply_drop_first(
def subset_categorical_drop_first(
int[:] indices,
int ncols,
bint drop_first
):
"""Construct the inputs to transform a CategoricalMatrix into a csr_matrix.
Expand Down Expand Up @@ -299,8 +304,8 @@ def subset_categorical_drop_first(

for i in range(nrows):
vnew_indptr[i] = nonzero_cnt
if indices[i] != 0:
vnew_indices[nonzero_cnt] = indices[i] - 1
if indices[i] >= drop_first:
vnew_indices[nonzero_cnt] = indices[i] - drop_first
nonzero_cnt += 1

vnew_indptr[i+1] = nonzero_cnt
Expand Down

0 comments on commit f97eb34

Please sign in to comment.