Skip to content

Commit

Permalink
Merge pull request #80 from r-lib/sparse-dummy
Browse files Browse the repository at this point in the history
Sparse dummy
  • Loading branch information
EmilHvitfeldt authored Oct 18, 2024
2 parents 2726b8a + 505431b commit d4890ec
Show file tree
Hide file tree
Showing 10 changed files with 657 additions and 7 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export(is_sparse_vector)
export(sparse_character)
export(sparse_default)
export(sparse_double)
export(sparse_dummy)
export(sparse_integer)
export(sparse_logical)
export(sparse_mean)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# sparsevctrs (development version)

* Helper function `sparse_dummy()` has beenn added. (#49)

* Helper functions `sparse_mean()`, `sparse_var()`, `sparse_sd()`, `sparse_median()` has been added. (#49)

* All sparse vector types now have a significant smaller base object size. (#67)
Expand Down
70 changes: 70 additions & 0 deletions R/sparse_dummy.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#' Generate sparse dummy variables
#'
#' @param x A factor.
#' @param one_hot A single logical value. Should the first factor level be
#' ignored. Defaults to `FALSE`.
#'
#' @details
#' Only factor variables can be used with [sparse_dummy()]. A call to
#' `as.factor()` would be required for any other type of data.
#'
#' If only a single level is present after `one_hot` takes effect. Then the
#' vector produced won't be sparse.
#'
#' A missing value at the `i`th element will produce missing values for all
#' dummy variables at thr `i`th position.
#'
#' @return A list of sparse integer dummy variables.
#'
#' @examples
#' x <- factor(c("a", "a", "b", "c", "d", "b"))
#'
#' sparse_dummy(x, one_hot = FALSE)
#'
#' x <- factor(c("a", "a", "b", "c", "d", "b"))
#'
#' sparse_dummy(x, one_hot = TRUE)
#'
#' x <- factor(c("a", NA, "b", "c", "d", NA))
#'
#' sparse_dummy(x, one_hot = FALSE)
#'
#' x <- factor(c("a", NA, "b", "c", "d", NA))
#'
#' sparse_dummy(x, one_hot = TRUE)
#' @export
sparse_dummy <- function(x, one_hot = FALSE) {
if (!is.factor(x)) {
cli::cli_abort("{.arg x} must be a factor, not {.obj_type_friendly {x}}.")
}

lvls <- levels(x)

x <- as.integer(x)

if (one_hot) {
lvls <- lvls[-1]
x <- x - 1L
}

n_lvls <- length(lvls)

if (n_lvls == 1) {
res <- list(rep(1L, length(x)))
names(res) <- lvls
return(res)
}

counts <- tabulate(x, nbins = n_lvls)

if (anyNA(x)) {
n_missing <- sum(is.na(x))
counts <- counts + n_missing
res <- .Call(ffi_sparse_dummy_na, x, lvls, counts, one_hot)
} else {
res <- .Call(ffi_sparse_dummy, x, lvls, counts, one_hot)
}

names(res) <- lvls
res
}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ reference:
- sparse_var
- sparse_sd
- sparse_median
- sparse_dummy

- title: Utility Functions
contents:
Expand Down
47 changes: 47 additions & 0 deletions man/sparse_dummy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions src/init.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <Rinternals.h>
#include "sparse-extractors.h"
#include "sparse-utils.h"
#include "sparse-dummy.h"

// Defined in altrep-sparse-double.c
extern SEXP ffi_altrep_new_sparse_double(SEXP);
Expand All @@ -19,15 +20,13 @@ extern SEXP ffi_altrep_new_sparse_logical(SEXP);
extern void sparsevctrs_init_altrep_sparse_logical(DllInfo*);

static const R_CallMethodDef CallEntries[] = {
{"ffi_altrep_new_sparse_double",
(DL_FUNC) &ffi_altrep_new_sparse_double,
1},
{"ffi_altrep_new_sparse_double", (DL_FUNC) &ffi_altrep_new_sparse_double, 1
},
{"ffi_altrep_new_sparse_integer",
(DL_FUNC) &ffi_altrep_new_sparse_integer,
1},
{"ffi_altrep_new_sparse_string",
(DL_FUNC) &ffi_altrep_new_sparse_string,
1},
{"ffi_altrep_new_sparse_string", (DL_FUNC) &ffi_altrep_new_sparse_string, 1
},
{"ffi_altrep_new_sparse_logical",
(DL_FUNC) &ffi_altrep_new_sparse_logical,
1},
Expand All @@ -36,7 +35,10 @@ static const R_CallMethodDef CallEntries[] = {
{"ffi_altrep_sparse_default", (DL_FUNC) &ffi_altrep_sparse_default, 1},
{"ffi_extract_altrep_class", (DL_FUNC) &ffi_extract_altrep_class, 1},
{"ffi_is_sparse_vector", (DL_FUNC) &ffi_is_sparse_vector, 1},
{NULL, NULL, 0}};
{"ffi_sparse_dummy", (DL_FUNC) &ffi_sparse_dummy, 4},
{"ffi_sparse_dummy_na", (DL_FUNC) &ffi_sparse_dummy_na, 4},
{NULL, NULL, 0}
};

void R_init_sparsevctrs(DllInfo* dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
Expand Down
Loading

0 comments on commit d4890ec

Please sign in to comment.