From a12acbb0bec5f4522c3cd44aeabbbdc066031378 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 12 Nov 2024 18:45:53 -0800
Subject: [PATCH 1/4] add `sparse` argument to `step_dummy()`

---
 NEWS.md                        |  2 +
 R/dummy.R                      | 89 ++++++++++++++++++++++------------
 man/step_dummy.Rd              |  5 ++
 tests/testthat/_snaps/dummy.md | 10 ++++
 tests/testthat/test-dummy.R    | 28 +++++++++++
 5 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6173cfbbe..9f9dd322b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,6 +10,8 @@
 
 * All steps and checks now require arguments `trained`, `skip`, `role`, and `id` at all times.
 
+* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors.
+
 # recipes 1.1.0
 
 ## Improvements
diff --git a/R/dummy.R b/R/dummy.R
index 226b4955b..8781a7e6b 100644
--- a/R/dummy.R
+++ b/R/dummy.R
@@ -18,6 +18,9 @@
 #' @param levels A list that contains the information needed to create dummy
 #'   variables for each variable contained in `terms`. This is `NULL` until the
 #'   step is trained by [prep()].
+#' @param sparse A logical. Should the columns produced be sparse vectors.
+#'   Sparsity is only supported for `"contr.treatment"` contrasts. Defaults to 
+#'   `FALSE`.
 #' @template step-return
 #' @family dummy variable and encoding steps
 #' @seealso [dummy_names()]
@@ -121,6 +124,7 @@ step_dummy <-
            preserve = deprecated(),
            naming = dummy_names,
            levels = NULL,
+           sparse = FALSE,
            keep_original_cols = FALSE,
            skip = FALSE,
            id = rand_id("dummy")) {
@@ -143,6 +147,7 @@ step_dummy <-
         preserve = keep_original_cols,
         naming = naming,
         levels = levels,
+        sparse = sparse,
         keep_original_cols = keep_original_cols,
         skip = skip,
         id = id
@@ -151,7 +156,7 @@ step_dummy <-
   }
 
 step_dummy_new <-
-  function(terms, role, trained, one_hot, preserve, naming, levels,
+  function(terms, role, trained, one_hot, preserve, naming, levels, sparse,
            keep_original_cols, skip, id) {
     step(
       subclass = "dummy",
@@ -162,6 +167,7 @@ step_dummy_new <-
       preserve = preserve,
       naming = naming,
       levels = levels,
+      sparse = sparse,
       keep_original_cols = keep_original_cols,
       skip = skip,
       id = id
@@ -174,6 +180,7 @@ prep.step_dummy <- function(x, training, info = NULL, ...) {
   check_type(training[, col_names], types = c("factor", "ordered"))
   check_bool(x$one_hot, arg = "one_hot")
   check_function(x$naming, arg = "naming", allow_empty = FALSE)
+  check_bool(x$sparse, arg = "sparse")
 
   if (length(col_names) > 0) {
     ## I hate doing this but currently we are going to have
@@ -218,6 +225,7 @@ prep.step_dummy <- function(x, training, info = NULL, ...) {
     preserve = x$preserve,
     naming = x$naming,
     levels = levels,
+    sparse = x$sparse,
     keep_original_cols = get_keep_original_cols(x),
     skip = x$skip,
     id = x$id
@@ -285,43 +293,60 @@ bake.step_dummy <- function(object, new_data, ...) {
       col_name,
       step = "step_dummy"
     )
+  
+    new_data[, col_name] <- factor(
+      new_data[[col_name]],
+      levels = levels_values,
+      ordered = is_ordered
+    )
 
-    new_data[, col_name] <-
-      factor(
-        new_data[[col_name]],
-        levels = levels_values,
-        ordered = is_ordered
-      )
+    if (object$sparse) {
+      current_contrast <- getOption("contrasts")[is_ordered + 1]
+      if (current_contrast != "contr.treatment") {
+        cli::cli_abort(
+          "when {.code sparse = TRUE}, only {.val contr.treatment} contrasts are
+          supported. Not {.val {current_contrast}}."
+        )
+      } 
 
-    indicators <-
-      model.frame(
-        rlang::new_formula(lhs = NULL, rhs = rlang::sym(col_name)),
-        data = new_data[, col_name],
-        xlev = levels_values,
-        na.action = na.pass
+      indicators <- sparsevctrs::sparse_dummy(
+        x = new_data[[col_name]],
+        one_hot = object$one_hot
       )
-
-    indicators <- tryCatch(
-      model.matrix(object = levels, data = indicators),
-      error = function(cnd) {
-        if (grepl("(vector memory|cannot allocate)", cnd$message)) {
-          n_levels <- length(attr(levels, "values"))
-          cli::cli_abort(
-            "{.var {col_name}} contains too many levels ({n_levels}), \\
-            which would result in a data.frame too large to fit in memory.",
-            call = NULL
-          )
+      indicators <- tibble::new_tibble(indicators)
+      used_lvl <- colnames(indicators)
+    } else {
+      indicators <-
+        model.frame(
+          rlang::new_formula(lhs = NULL, rhs = rlang::sym(col_name)),
+          data = new_data[, col_name],
+          xlev = levels_values,
+          na.action = na.pass
+        )
+  
+      indicators <- tryCatch(
+        model.matrix(object = levels, data = indicators),
+        error = function(cnd) {
+          if (grepl("(vector memory|cannot allocate)", cnd$message)) {
+            n_levels <- length(attr(levels, "values"))
+            cli::cli_abort(
+              "{.var {col_name}} contains too many levels ({n_levels}), \\
+              which would result in a data.frame too large to fit in memory.",
+              call = NULL
+            )
+          }
+          stop(cnd)
         }
-        stop(cnd)
+      )
+  
+      if (!object$one_hot) {
+        indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE]
       }
-    )
-
-    if (!object$one_hot) {
-      indicators <- indicators[, colnames(indicators) != "(Intercept)", drop = FALSE]
+      
+      ## use backticks for nonstandard factor levels here
+      used_lvl <- gsub(paste0("^\\`?", col_name, "\\`?"), "", colnames(indicators))
     }
-
-    ## use backticks for nonstandard factor levels here
-    used_lvl <- gsub(paste0("^\\`?", col_name, "\\`?"), "", colnames(indicators))
+    
     new_names <- object$naming(col_name, used_lvl, is_ordered)
     colnames(indicators) <- new_names
     indicators <- check_name(indicators, new_data, object, new_names)
diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd
index 0712cab50..3b8425040 100644
--- a/man/step_dummy.Rd
+++ b/man/step_dummy.Rd
@@ -13,6 +13,7 @@ step_dummy(
   preserve = deprecated(),
   naming = dummy_names,
   levels = NULL,
+  sparse = FALSE,
   keep_original_cols = FALSE,
   skip = FALSE,
   id = rand_id("dummy")
@@ -46,6 +47,10 @@ columns. See Details below.}
 variables for each variable contained in \code{terms}. This is \code{NULL} until the
 step is trained by \code{\link[=prep]{prep()}}.}
 
+\item{sparse}{A logical. Should the columns produced be sparse vectors.
+Sparsity is only supported for \code{"contr.treatment"} contrasts. Defaults to
+\code{FALSE}.}
+
 \item{keep_original_cols}{A logical to keep the original variables in the
 output. Defaults to \code{FALSE}.}
 
diff --git a/tests/testthat/_snaps/dummy.md b/tests/testthat/_snaps/dummy.md
index f64386c0c..bcb518f84 100644
--- a/tests/testthat/_snaps/dummy.md
+++ b/tests/testthat/_snaps/dummy.md
@@ -154,6 +154,16 @@
       Caused by error in `bake()`:
       ! Only one factor level in `x`: "only-level".
 
+# sparse = TRUE errors on unsupported contrasts
+
+    Code
+      recipe(~., data = tibble(x = letters)) %>% step_dummy(x, sparse = TRUE) %>%
+        prep()
+    Condition
+      Error in `step_dummy()`:
+      Caused by error in `bake()`:
+      ! when `sparse = TRUE`, only "contr.treatment" contrasts are supported. Not "contr.helmert".
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/test-dummy.R b/tests/testthat/test-dummy.R
index 2a576e524..1d77ea4a8 100644
--- a/tests/testthat/test-dummy.R
+++ b/tests/testthat/test-dummy.R
@@ -354,6 +354,34 @@ test_that("throws an informative error for single level", {
   )
 })
 
+test_that("sparse = TRUE works", {
+  rec <- recipe(~ ., data = tibble(x = c(NA, letters)))
+
+  suppressWarnings({
+    dense <- rec %>% step_dummy(x, sparse = FALSE) %>% prep() %>% bake(NULL)
+    dense <- purrr::map(dense, as.integer) %>% tibble::new_tibble()
+    sparse <- rec %>% step_dummy(x, sparse = TRUE) %>% prep() %>% bake(NULL)
+  })
+
+  expect_identical(dense, sparse)
+
+  expect_false(any(vapply(dense, sparsevctrs::is_sparse_vector, logical(1))))
+  expect_true(all(vapply(sparse, sparsevctrs::is_sparse_vector, logical(1))))
+})
+
+test_that("sparse = TRUE errors on unsupported contrasts", {
+  go_helmert <- getOption("contrasts")
+  go_helmert["unordered"] <- "contr.helmert"
+  withr::local_options(contrasts = go_helmert)
+
+  expect_snapshot(
+    error = TRUE,
+    recipe(~ ., data = tibble(x = letters)) %>% 
+      step_dummy(x, sparse = TRUE) %>% 
+      prep()
+  )
+})
+
 # Infrastructure ---------------------------------------------------------------
 
 test_that("bake method errors when needed non-standard role columns are missing", {

From e708dda4608eab421b50e0a4749a86852752aa11 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emil.hvitfeldt@posit.co>
Date: Wed, 13 Nov 2024 10:10:24 -0800
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Simon P. Couch <simonpatrickcouch@gmail.com>
---
 NEWS.md   | 2 +-
 R/dummy.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9f9dd322b..5f1c02ce1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,7 +10,7 @@
 
 * All steps and checks now require arguments `trained`, `skip`, `role`, and `id` at all times.
 
-* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors.
+* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors. (#1392)
 
 # recipes 1.1.0
 
diff --git a/R/dummy.R b/R/dummy.R
index 8781a7e6b..9ead0d559 100644
--- a/R/dummy.R
+++ b/R/dummy.R
@@ -304,8 +304,8 @@ bake.step_dummy <- function(object, new_data, ...) {
       current_contrast <- getOption("contrasts")[is_ordered + 1]
       if (current_contrast != "contr.treatment") {
         cli::cli_abort(
-          "when {.code sparse = TRUE}, only {.val contr.treatment} contrasts are
-          supported. Not {.val {current_contrast}}."
+          "When {.code sparse = TRUE}, only {.val contr.treatment} contrasts are
+          supported, not {.val {current_contrast}}."
         )
       } 
 

From 6e23367e089c82726b89791a205ffc28a5bd9749 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 13 Nov 2024 10:15:52 -0800
Subject: [PATCH 3/4] document the class of columns produced by step_dummy()

---
 R/dummy.R         | 3 ++-
 man/step_dummy.Rd | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/dummy.R b/R/dummy.R
index 9ead0d559..19e85ea8b 100644
--- a/R/dummy.R
+++ b/R/dummy.R
@@ -63,7 +63,8 @@
 #' this step.
 #'
 #' Also, there are a number of contrast methods that return fractional values.
-#' The columns returned by this step are doubles (not integers).
+#' The columns returned by this step are doubles (not integers) when 
+#' `sparse = FALSE`. The columns returned when `sparse = TRUE` are integers.
 #'
 #' The [package vignette for dummy variables](https://recipes.tidymodels.org/articles/Dummies.html)
 #' and interactions has more information.
diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd
index 3b8425040..1a6b91560 100644
--- a/man/step_dummy.Rd
+++ b/man/step_dummy.Rd
@@ -112,7 +112,8 @@ be changed by passing in a different function to the \code{naming} argument for
 this step.
 
 Also, there are a number of contrast methods that return fractional values.
-The columns returned by this step are doubles (not integers).
+The columns returned by this step are doubles (not integers) when
+\code{sparse = FALSE}. The columns returned when \code{sparse = TRUE} are integers.
 
 The \href{https://recipes.tidymodels.org/articles/Dummies.html}{package vignette for dummy variables}
 and interactions has more information.

From 6031f7f3664632f9b94ae154bbe1bd47afb08e4e Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 13 Nov 2024 10:29:37 -0800
Subject: [PATCH 4/4] update snapshots

---
 tests/testthat/_snaps/dummy.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/_snaps/dummy.md b/tests/testthat/_snaps/dummy.md
index bcb518f84..2b9da790c 100644
--- a/tests/testthat/_snaps/dummy.md
+++ b/tests/testthat/_snaps/dummy.md
@@ -162,7 +162,7 @@
     Condition
       Error in `step_dummy()`:
       Caused by error in `bake()`:
-      ! when `sparse = TRUE`, only "contr.treatment" contrasts are supported. Not "contr.helmert".
+      ! When `sparse = TRUE`, only "contr.treatment" contrasts are supported, not "contr.helmert".
 
 # bake method errors when needed non-standard role columns are missing