From a12acbb0bec5f4522c3cd44aeabbbdc066031378 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 12 Nov 2024 18:45:53 -0800 Subject: [PATCH 1/4] add `sparse` argument to `step_dummy()` --- NEWS.md | 2 + R/dummy.R | 89 ++++++++++++++++++++++------------ man/step_dummy.Rd | 5 ++ tests/testthat/_snaps/dummy.md | 10 ++++ tests/testthat/test-dummy.R | 28 +++++++++++ 5 files changed, 102 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6173cfbbe..9f9dd322b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ * All steps and checks now require arguments `trained`, `skip`, `role`, and `id` at all times. +* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors. + # recipes 1.1.0 ## Improvements diff --git a/R/dummy.R b/R/dummy.R index 226b4955b..8781a7e6b 100644 --- a/R/dummy.R +++ b/R/dummy.R @@ -18,6 +18,9 @@ #' @param levels A list that contains the information needed to create dummy #' variables for each variable contained in `terms`. This is `NULL` until the #' step is trained by [prep()]. +#' @param sparse A logical. Should the columns produced be sparse vectors. +#' Sparsity is only supported for `"contr.treatment"` contrasts. Defaults to +#' `FALSE`. #' @template step-return #' @family dummy variable and encoding steps #' @seealso [dummy_names()] @@ -121,6 +124,7 @@ step_dummy <- preserve = deprecated(), naming = dummy_names, levels = NULL, + sparse = FALSE, keep_original_cols = FALSE, skip = FALSE, id = rand_id("dummy")) { @@ -143,6 +147,7 @@ step_dummy <- preserve = keep_original_cols, naming = naming, levels = levels, + sparse = sparse, keep_original_cols = keep_original_cols, skip = skip, id = id @@ -151,7 +156,7 @@ step_dummy <- } step_dummy_new <- - function(terms, role, trained, one_hot, preserve, naming, levels, + function(terms, role, trained, one_hot, preserve, naming, levels, sparse, keep_original_cols, skip, id) { step( subclass = "dummy", @@ -162,6 +167,7 @@ step_dummy_new <- preserve = preserve, naming = naming, levels = levels, + sparse = sparse, keep_original_cols = keep_original_cols, skip = skip, id = id @@ -174,6 +180,7 @@ prep.step_dummy <- function(x, training, info = NULL, ...) { check_type(training[, col_names], types = c("factor", "ordered")) check_bool(x$one_hot, arg = "one_hot") check_function(x$naming, arg = "naming", allow_empty = FALSE) + check_bool(x$sparse, arg = "sparse") if (length(col_names) > 0) { ## I hate doing this but currently we are going to have @@ -218,6 +225,7 @@ prep.step_dummy <- function(x, training, info = NULL, ...) { preserve = x$preserve, naming = x$naming, levels = levels, + sparse = x$sparse, keep_original_cols = get_keep_original_cols(x), skip = x$skip, id = x$id @@ -285,43 +293,60 @@ bake.step_dummy <- function(object, new_data, ...) { col_name, step = "step_dummy" ) + + new_data[, col_name] <- factor( + new_data[[col_name]], + levels = levels_values, + ordered = is_ordered + ) - new_data[, col_name] <- - factor( - new_data[[col_name]], - levels = levels_values, - ordered = is_ordered - ) + if (object$sparse) { + current_contrast <- getOption("contrasts")[is_ordered + 1] + if (current_contrast != "contr.treatment") { + cli::cli_abort( + "when {.code sparse = TRUE}, only {.val contr.treatment} contrasts are + supported. Not {.val {current_contrast}}." + ) + } - indicators <- - model.frame( - rlang::new_formula(lhs = NULL, rhs = rlang::sym(col_name)), - data = new_data[, col_name], - xlev = levels_values, - na.action = na.pass + indicators <- sparsevctrs::sparse_dummy( + x = new_data[[col_name]], + one_hot = object$one_hot ) - - indicators <- tryCatch( - model.matrix(object = levels, data = indicators), - error = function(cnd) { - if (grepl("(vector memory|cannot allocate)", cnd$message)) { - n_levels <- length(attr(levels, "values")) - cli::cli_abort( - "{.var {col_name}} contains too many levels ({n_levels}), \\ - which would result in a data.frame too large to fit in memory.", - call = NULL - ) + indicators <- tibble::new_tibble(indicators) + used_lvl <- colnames(indicators) + } else { + indicators <- + model.frame( + rlang::new_formula(lhs = NULL, rhs = rlang::sym(col_name)), + data = new_data[, col_name], + xlev = levels_values, + na.action = na.pass + ) + + indicators <- tryCatch( + model.matrix(object = levels, data = indicators), + error = function(cnd) { + if (grepl("(vector memory|cannot allocate)", cnd$message)) { + n_levels <- length(attr(levels, "values")) + cli::cli_abort( + "{.var {col_name}} contains too many levels ({n_levels}), \\ + which would result in a data.frame too large to fit in memory.", + call = NULL + ) + } + stop(cnd) } - stop(cnd) + ) + + if (!object$one_hot) { + indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE] } - ) - - if (!object$one_hot) { - indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE] + + ## use backticks for nonstandard factor levels here + used_lvl <- gsub(paste0("^\\`?", col_name, "\\`?"), "", colnames(indicators)) } - - ## use backticks for nonstandard factor levels here - used_lvl <- gsub(paste0("^\\`?", col_name, "\\`?"), "", colnames(indicators)) + new_names <- object$naming(col_name, used_lvl, is_ordered) colnames(indicators) <- new_names indicators <- check_name(indicators, new_data, object, new_names) diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd index 0712cab50..3b8425040 100644 --- a/man/step_dummy.Rd +++ b/man/step_dummy.Rd @@ -13,6 +13,7 @@ step_dummy( preserve = deprecated(), naming = dummy_names, levels = NULL, + sparse = FALSE, keep_original_cols = FALSE, skip = FALSE, id = rand_id("dummy") @@ -46,6 +47,10 @@ columns. See Details below.} variables for each variable contained in \code{terms}. This is \code{NULL} until the step is trained by \code{\link[=prep]{prep()}}.} +\item{sparse}{A logical. Should the columns produced be sparse vectors. +Sparsity is only supported for \code{"contr.treatment"} contrasts. Defaults to +\code{FALSE}.} + \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} diff --git a/tests/testthat/_snaps/dummy.md b/tests/testthat/_snaps/dummy.md index f64386c0c..bcb518f84 100644 --- a/tests/testthat/_snaps/dummy.md +++ b/tests/testthat/_snaps/dummy.md @@ -154,6 +154,16 @@ Caused by error in `bake()`: ! Only one factor level in `x`: "only-level". +# sparse = TRUE errors on unsupported contrasts + + Code + recipe(~., data = tibble(x = letters)) %>% step_dummy(x, sparse = TRUE) %>% + prep() + Condition + Error in `step_dummy()`: + Caused by error in `bake()`: + ! when `sparse = TRUE`, only "contr.treatment" contrasts are supported. Not "contr.helmert". + # bake method errors when needed non-standard role columns are missing Code diff --git a/tests/testthat/test-dummy.R b/tests/testthat/test-dummy.R index 2a576e524..1d77ea4a8 100644 --- a/tests/testthat/test-dummy.R +++ b/tests/testthat/test-dummy.R @@ -354,6 +354,34 @@ test_that("throws an informative error for single level", { ) }) +test_that("sparse = TRUE works", { + rec <- recipe(~ ., data = tibble(x = c(NA, letters))) + + suppressWarnings({ + dense <- rec %>% step_dummy(x, sparse = FALSE) %>% prep() %>% bake(NULL) + dense <- purrr::map(dense, as.integer) %>% tibble::new_tibble() + sparse <- rec %>% step_dummy(x, sparse = TRUE) %>% prep() %>% bake(NULL) + }) + + expect_identical(dense, sparse) + + expect_false(any(vapply(dense, sparsevctrs::is_sparse_vector, logical(1)))) + expect_true(all(vapply(sparse, sparsevctrs::is_sparse_vector, logical(1)))) +}) + +test_that("sparse = TRUE errors on unsupported contrasts", { + go_helmert <- getOption("contrasts") + go_helmert["unordered"] <- "contr.helmert" + withr::local_options(contrasts = go_helmert) + + expect_snapshot( + error = TRUE, + recipe(~ ., data = tibble(x = letters)) %>% + step_dummy(x, sparse = TRUE) %>% + prep() + ) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { From e708dda4608eab421b50e0a4749a86852752aa11 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 13 Nov 2024 10:10:24 -0800 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Simon P. Couch --- NEWS.md | 2 +- R/dummy.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9f9dd322b..5f1c02ce1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ * All steps and checks now require arguments `trained`, `skip`, `role`, and `id` at all times. -* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors. +* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors. (#1392) # recipes 1.1.0 diff --git a/R/dummy.R b/R/dummy.R index 8781a7e6b..9ead0d559 100644 --- a/R/dummy.R +++ b/R/dummy.R @@ -304,8 +304,8 @@ bake.step_dummy <- function(object, new_data, ...) { current_contrast <- getOption("contrasts")[is_ordered + 1] if (current_contrast != "contr.treatment") { cli::cli_abort( - "when {.code sparse = TRUE}, only {.val contr.treatment} contrasts are - supported. Not {.val {current_contrast}}." + "When {.code sparse = TRUE}, only {.val contr.treatment} contrasts are + supported, not {.val {current_contrast}}." ) } From 6e23367e089c82726b89791a205ffc28a5bd9749 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 13 Nov 2024 10:15:52 -0800 Subject: [PATCH 3/4] document the class of columns produced by step_dummy() --- R/dummy.R | 3 ++- man/step_dummy.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/dummy.R b/R/dummy.R index 9ead0d559..19e85ea8b 100644 --- a/R/dummy.R +++ b/R/dummy.R @@ -63,7 +63,8 @@ #' this step. #' #' Also, there are a number of contrast methods that return fractional values. -#' The columns returned by this step are doubles (not integers). +#' The columns returned by this step are doubles (not integers) when +#' `sparse = FALSE`. The columns returned when `sparse = TRUE` are integers. #' #' The [package vignette for dummy variables](https://recipes.tidymodels.org/articles/Dummies.html) #' and interactions has more information. diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd index 3b8425040..1a6b91560 100644 --- a/man/step_dummy.Rd +++ b/man/step_dummy.Rd @@ -112,7 +112,8 @@ be changed by passing in a different function to the \code{naming} argument for this step. Also, there are a number of contrast methods that return fractional values. -The columns returned by this step are doubles (not integers). +The columns returned by this step are doubles (not integers) when +\code{sparse = FALSE}. The columns returned when \code{sparse = TRUE} are integers. The \href{https://recipes.tidymodels.org/articles/Dummies.html}{package vignette for dummy variables} and interactions has more information. From 6031f7f3664632f9b94ae154bbe1bd47afb08e4e Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 13 Nov 2024 10:29:37 -0800 Subject: [PATCH 4/4] update snapshots --- tests/testthat/_snaps/dummy.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/_snaps/dummy.md b/tests/testthat/_snaps/dummy.md index bcb518f84..2b9da790c 100644 --- a/tests/testthat/_snaps/dummy.md +++ b/tests/testthat/_snaps/dummy.md @@ -162,7 +162,7 @@ Condition Error in `step_dummy()`: Caused by error in `bake()`: - ! when `sparse = TRUE`, only "contr.treatment" contrasts are supported. Not "contr.helmert". + ! When `sparse = TRUE`, only "contr.treatment" contrasts are supported, not "contr.helmert". # bake method errors when needed non-standard role columns are missing