From 37afbdf7dccebca385c9db7e44543b78af3a7299 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 16 May 2024 10:19:50 +0200
Subject: [PATCH] fix demean()

---
 R/demean.R                      | 74 ++++++++++++++++-----------------
 man/demean.Rd                   | 29 +++++++------
 tests/testthat/_snaps/demean.md |  2 +-
 tests/testthat/test-demean.R    | 20 +++++----
 4 files changed, 68 insertions(+), 57 deletions(-)

diff --git a/R/demean.R b/R/demean.R
index 69e833a4d..fc797fc9e 100644
--- a/R/demean.R
+++ b/R/demean.R
@@ -11,7 +11,7 @@
 #' @param x A data frame.
 #' @param select Character vector (or formula) with names of variables to select
 #'   that should be group- and de-meaned.
-#' @param group Character vector (or formula) with the name of the variable that
+#' @param by Character vector (or formula) with the name of the variable that
 #'   indicates the group- or cluster-ID.
 #' @param center Method for centering. `demean()` always performs
 #'   mean-centering, while `degroup()` can use `center = "median"` or
@@ -25,6 +25,7 @@
 #'   attributes to indicate the within- and between-effects. This is only
 #'   relevant when printing `model_parameters()` - in such cases, the
 #'   within- and between-effects are printed in separated blocks.
+#' @param group Deprecated. Use `by` instead.
 #' @inheritParams center
 #'
 #' @return
@@ -92,7 +93,7 @@
 #'
 #'   \subsection{Terminology}{
 #'     The group-meaned variable is simply the mean of an independent variable
-#'     within each group (or id-level or cluster) represented by `group`.
+#'     within each group (or id-level or cluster) represented by `by`.
 #'     It represents the cluster-mean of an independent variable. The regression
 #'     coefficient of a group-meaned variable is the *between-subject-effect*.
 #'     The de-meaned variable is then the centered version of the group-meaned
@@ -199,10 +200,10 @@
 #' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
 #' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
 #'
-#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
+#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
 #' head(x)
 #'
-#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
+#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
 #' head(x)
 #'
 #'
@@ -213,23 +214,29 @@
 #'   y = c(1, 2, 1, 2, 4, 3, 2, 1),
 #'   ID = c(1, 2, 3, 1, 2, 3, 1, 2)
 #' )
-#' demean(dat, select = c("a", "x*y"), group = "ID")
+#' demean(dat, select = c("a", "x*y"), by = "ID")
 #'
 #' # or in formula-notation
-#' demean(dat, select = ~ a + x * y, group = ~ID)
+#' demean(dat, select = ~ a + x * y, by = ~ID)
 #'
 #' @export
 demean <- function(x,
                    select,
-                   group,
+                   by,
                    suffix_demean = "_within",
                    suffix_groupmean = "_between",
                    add_attributes = TRUE,
-                   verbose = TRUE) {
+                   verbose = TRUE,
+                   group = NULL) {
+  ## TODO: deprecate later
+  if (!is.null(group)) {
+    by <- group
+  }
+
   degroup(
     x = x,
     select = select,
-    group = group,
+    by = by,
     center = "mean",
     suffix_demean = suffix_demean,
     suffix_groupmean = suffix_groupmean,
@@ -247,12 +254,18 @@ demean <- function(x,
 #' @export
 degroup <- function(x,
                     select,
-                    group,
+                    by,
                     center = "mean",
                     suffix_demean = "_within",
                     suffix_groupmean = "_between",
                     add_attributes = TRUE,
-                    verbose = TRUE) {
+                    verbose = TRUE,
+                    group = NULL) {
+  ## TODO: deprecate later
+  if (!is.null(group)) {
+    by <- group
+  }
+
   # ugly tibbles again...
   x <- .coerce_to_dataframe(x)
 
@@ -266,8 +279,8 @@ degroup <- function(x,
     ))
   }
 
-  if (inherits(group, "formula")) {
-    group <- all.vars(group)
+  if (inherits(by, "formula")) {
+    by <- all.vars(by)
   }
 
   interactions_no <- select[!grepl("(\\*|\\:)", select)]
@@ -296,7 +309,7 @@ degroup <- function(x,
   select <- intersect(colnames(x), select)
 
   # get data to demean...
-  dat <- x[, c(select, group)]
+  dat <- x[, c(select, by)]
 
 
   # find categorical predictors that are coded as factors
@@ -344,31 +357,18 @@ degroup <- function(x,
   # for variables within each group (the group means). assign
   # mean values to a vector of same length as the data
 
-  if (center == "mode") {
-    x_gm_list <- lapply(select, function(i) {
-      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
-    })
-  } else if (center == "median") {
-    x_gm_list <- lapply(select, function(i) {
-      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
-    })
-  } else if (center == "min") {
-    x_gm_list <- lapply(select, function(i) {
-      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
-    })
-  } else if (center == "max") {
-    x_gm_list <- lapply(select, function(i) {
-      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
-    })
-  } else {
-    x_gm_list <- lapply(select, function(i) {
-      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
-    })
-  }
-
+  gm_fun <- switch(center,
+    mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
+    median = function(.gm) stats::median(.gm, na.rm = TRUE),
+    min = function(.gm) min(.gm, na.rm = TRUE),
+    max = function(.gm) max(.gm, na.rm = TRUE),
+    function(.gm) mean(.gm, na.rm = TRUE)
+  )
+  x_gm_list <- lapply(select, function(i) {
+    stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
+  })
   names(x_gm_list) <- select
 
-
   # create de-meaned variables by subtracting the group mean from each individual value
 
   x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
diff --git a/man/demean.Rd b/man/demean.Rd
index 422c8d32e..d03a1010b 100644
--- a/man/demean.Rd
+++ b/man/demean.Rd
@@ -9,33 +9,36 @@
 demean(
   x,
   select,
-  group,
+  by,
   suffix_demean = "_within",
   suffix_groupmean = "_between",
   add_attributes = TRUE,
-  verbose = TRUE
+  verbose = TRUE,
+  group = NULL
 )
 
 degroup(
   x,
   select,
-  group,
+  by,
   center = "mean",
   suffix_demean = "_within",
   suffix_groupmean = "_between",
   add_attributes = TRUE,
-  verbose = TRUE
+  verbose = TRUE,
+  group = NULL
 )
 
 detrend(
   x,
   select,
-  group,
+  by,
   center = "mean",
   suffix_demean = "_within",
   suffix_groupmean = "_between",
   add_attributes = TRUE,
-  verbose = TRUE
+  verbose = TRUE,
+  group = NULL
 )
 }
 \arguments{
@@ -44,7 +47,7 @@ detrend(
 \item{select}{Character vector (or formula) with names of variables to select
 that should be group- and de-meaned.}
 
-\item{group}{Character vector (or formula) with the name of the variable that
+\item{by}{Character vector (or formula) with the name of the variable that
 indicates the group- or cluster-ID.}
 
 \item{suffix_demean, suffix_groupmean}{String value, will be appended to the
@@ -59,6 +62,8 @@ within- and between-effects are printed in separated blocks.}
 
 \item{verbose}{Toggle warnings and messages.}
 
+\item{group}{Deprecated. Use \code{by} instead.}
+
 \item{center}{Method for centering. \code{demean()} always performs
 mean-centering, while \code{degroup()} can use \code{center = "median"} or
 \code{center = "mode"} for median- or mode-centering, and also \code{"min"}
@@ -131,7 +136,7 @@ intervals and low statistical power} (\cite{Heisig et al. 2017}).
 
 \subsection{Terminology}{
 The group-meaned variable is simply the mean of an independent variable
-within each group (or id-level or cluster) represented by \code{group}.
+within each group (or id-level or cluster) represented by \code{by}.
 It represents the cluster-mean of an independent variable. The regression
 coefficient of a group-meaned variable is the \emph{between-subject-effect}.
 The de-meaned variable is then the centered version of the group-meaned
@@ -209,10 +214,10 @@ data(iris)
 iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
 iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
 
-x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
+x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
 head(x)
 
-x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
+x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
 head(x)
 
 
@@ -223,10 +228,10 @@ dat <- data.frame(
   y = c(1, 2, 1, 2, 4, 3, 2, 1),
   ID = c(1, 2, 3, 1, 2, 3, 1, 2)
 )
-demean(dat, select = c("a", "x*y"), group = "ID")
+demean(dat, select = c("a", "x*y"), by = "ID")
 
 # or in formula-notation
-demean(dat, select = ~ a + x * y, group = ~ID)
+demean(dat, select = ~ a + x * y, by = ~ID)
 
 }
 \references{
diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md
index f61ba9fcb..7f12d263d 100644
--- a/tests/testthat/_snaps/demean.md
+++ b/tests/testthat/_snaps/demean.md
@@ -55,7 +55,7 @@
 # demean interaction term
 
     Code
-      demean(dat, select = c("a", "x*y"), group = "ID")
+      demean(dat, select = c("a", "x*y"), by = "ID")
     Output
         a_between x_y_between   a_within x_y_within
       1  2.666667    4.666667 -1.6666667 -0.6666667
diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R
index a2f803cd7..566bd6097 100644
--- a/tests/testthat/test-demean.R
+++ b/tests/testthat/test-demean.R
@@ -2,29 +2,35 @@ test_that("demean works", {
   df <- iris
 
   set.seed(123)
-  df$ID <- sample(1:4, nrow(df), replace = TRUE) # fake-ID
+  df$ID <- sample.int(4, nrow(df), replace = TRUE) # fake-ID
 
   set.seed(123)
   df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable
 
   set.seed(123)
-  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), group = "ID")
+  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
   expect_snapshot(head(x))
 
   set.seed(123)
   expect_message(
-    x <- demean(df, select = c("Sepal.Length", "binary", "Species"), group = "ID"),
+    {
+      x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+    },
     "have been coerced to numeric"
   )
   expect_snapshot(head(x))
 
   set.seed(123)
   expect_message(
-    y <- demean(df, select = ~ Sepal.Length + binary + Species, group = ~ID),
+    {
+      y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID)
+    },
     "have been coerced to numeric"
   )
   expect_message(
-    z <- demean(df, select = c("Sepal.Length", "binary", "Species"), group = "ID"),
+    {
+      z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+    },
     "have been coerced to numeric"
   )
   expect_identical(y, z)
@@ -39,7 +45,7 @@ test_that("demean interaction term", {
   )
 
   set.seed(123)
-  expect_snapshot(demean(dat, select = c("a", "x*y"), group = "ID"))
+  expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID"))
 })
 
 test_that("demean shows message if some vars don't exist", {
@@ -52,7 +58,7 @@ test_that("demean shows message if some vars don't exist", {
 
   set.seed(123)
   expect_message(
-    demean(dat, select = "foo", group = "ID"),
+    demean(dat, select = "foo", by = "ID"),
     regexp = "not found"
   )
 })