From 37afbdf7dccebca385c9db7e44543b78af3a7299 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 16 May 2024 10:19:50 +0200 Subject: [PATCH] fix demean() --- R/demean.R | 74 ++++++++++++++++----------------- man/demean.Rd | 29 +++++++------ tests/testthat/_snaps/demean.md | 2 +- tests/testthat/test-demean.R | 20 +++++---- 4 files changed, 68 insertions(+), 57 deletions(-) diff --git a/R/demean.R b/R/demean.R index 69e833a4d..fc797fc9e 100644 --- a/R/demean.R +++ b/R/demean.R @@ -11,7 +11,7 @@ #' @param x A data frame. #' @param select Character vector (or formula) with names of variables to select #' that should be group- and de-meaned. -#' @param group Character vector (or formula) with the name of the variable that +#' @param by Character vector (or formula) with the name of the variable that #' indicates the group- or cluster-ID. #' @param center Method for centering. `demean()` always performs #' mean-centering, while `degroup()` can use `center = "median"` or @@ -25,6 +25,7 @@ #' attributes to indicate the within- and between-effects. This is only #' relevant when printing `model_parameters()` - in such cases, the #' within- and between-effects are printed in separated blocks. +#' @param group Deprecated. Use `by` instead. #' @inheritParams center #' #' @return @@ -92,7 +93,7 @@ #' #' \subsection{Terminology}{ #' The group-meaned variable is simply the mean of an independent variable -#' within each group (or id-level or cluster) represented by `group`. +#' within each group (or id-level or cluster) represented by `by`. #' It represents the cluster-mean of an independent variable. The regression #' coefficient of a group-meaned variable is the *between-subject-effect*. #' The de-meaned variable is then the centered version of the group-meaned @@ -199,10 +200,10 @@ #' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID #' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable #' -#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID") +#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID") #' head(x) #' -#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID") +#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID") #' head(x) #' #' @@ -213,23 +214,29 @@ #' y = c(1, 2, 1, 2, 4, 3, 2, 1), #' ID = c(1, 2, 3, 1, 2, 3, 1, 2) #' ) -#' demean(dat, select = c("a", "x*y"), group = "ID") +#' demean(dat, select = c("a", "x*y"), by = "ID") #' #' # or in formula-notation -#' demean(dat, select = ~ a + x * y, group = ~ID) +#' demean(dat, select = ~ a + x * y, by = ~ID) #' #' @export demean <- function(x, select, - group, + by, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE) { + verbose = TRUE, + group = NULL) { + ## TODO: deprecate later + if (!is.null(group)) { + by <- group + } + degroup( x = x, select = select, - group = group, + by = by, center = "mean", suffix_demean = suffix_demean, suffix_groupmean = suffix_groupmean, @@ -247,12 +254,18 @@ demean <- function(x, #' @export degroup <- function(x, select, - group, + by, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE) { + verbose = TRUE, + group = NULL) { + ## TODO: deprecate later + if (!is.null(group)) { + by <- group + } + # ugly tibbles again... x <- .coerce_to_dataframe(x) @@ -266,8 +279,8 @@ degroup <- function(x, )) } - if (inherits(group, "formula")) { - group <- all.vars(group) + if (inherits(by, "formula")) { + by <- all.vars(by) } interactions_no <- select[!grepl("(\\*|\\:)", select)] @@ -296,7 +309,7 @@ degroup <- function(x, select <- intersect(colnames(x), select) # get data to demean... - dat <- x[, c(select, group)] + dat <- x[, c(select, by)] # find categorical predictors that are coded as factors @@ -344,31 +357,18 @@ degroup <- function(x, # for variables within each group (the group means). assign # mean values to a vector of same length as the data - if (center == "mode") { - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm))) - }) - } else if (center == "median") { - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE)) - }) - } else if (center == "min") { - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE)) - }) - } else if (center == "max") { - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE)) - }) - } else { - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE)) - }) - } - + gm_fun <- switch(center, + mode = function(.gm) distribution_mode(stats::na.omit(.gm)), + median = function(.gm) stats::median(.gm, na.rm = TRUE), + min = function(.gm) min(.gm, na.rm = TRUE), + max = function(.gm) max(.gm, na.rm = TRUE), + function(.gm) mean(.gm, na.rm = TRUE) + ) + x_gm_list <- lapply(select, function(i) { + stats::ave(dat[[i]], dat[[by]], FUN = gm_fun) + }) names(x_gm_list) <- select - # create de-meaned variables by subtracting the group mean from each individual value x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]]) diff --git a/man/demean.Rd b/man/demean.Rd index 422c8d32e..d03a1010b 100644 --- a/man/demean.Rd +++ b/man/demean.Rd @@ -9,33 +9,36 @@ demean( x, select, - group, + by, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE + verbose = TRUE, + group = NULL ) degroup( x, select, - group, + by, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE + verbose = TRUE, + group = NULL ) detrend( x, select, - group, + by, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE + verbose = TRUE, + group = NULL ) } \arguments{ @@ -44,7 +47,7 @@ detrend( \item{select}{Character vector (or formula) with names of variables to select that should be group- and de-meaned.} -\item{group}{Character vector (or formula) with the name of the variable that +\item{by}{Character vector (or formula) with the name of the variable that indicates the group- or cluster-ID.} \item{suffix_demean, suffix_groupmean}{String value, will be appended to the @@ -59,6 +62,8 @@ within- and between-effects are printed in separated blocks.} \item{verbose}{Toggle warnings and messages.} +\item{group}{Deprecated. Use \code{by} instead.} + \item{center}{Method for centering. \code{demean()} always performs mean-centering, while \code{degroup()} can use \code{center = "median"} or \code{center = "mode"} for median- or mode-centering, and also \code{"min"} @@ -131,7 +136,7 @@ intervals and low statistical power} (\cite{Heisig et al. 2017}). \subsection{Terminology}{ The group-meaned variable is simply the mean of an independent variable -within each group (or id-level or cluster) represented by \code{group}. +within each group (or id-level or cluster) represented by \code{by}. It represents the cluster-mean of an independent variable. The regression coefficient of a group-meaned variable is the \emph{between-subject-effect}. The de-meaned variable is then the centered version of the group-meaned @@ -209,10 +214,10 @@ data(iris) iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable -x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID") +x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID") head(x) -x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID") +x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID") head(x) @@ -223,10 +228,10 @@ dat <- data.frame( y = c(1, 2, 1, 2, 4, 3, 2, 1), ID = c(1, 2, 3, 1, 2, 3, 1, 2) ) -demean(dat, select = c("a", "x*y"), group = "ID") +demean(dat, select = c("a", "x*y"), by = "ID") # or in formula-notation -demean(dat, select = ~ a + x * y, group = ~ID) +demean(dat, select = ~ a + x * y, by = ~ID) } \references{ diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md index f61ba9fcb..7f12d263d 100644 --- a/tests/testthat/_snaps/demean.md +++ b/tests/testthat/_snaps/demean.md @@ -55,7 +55,7 @@ # demean interaction term Code - demean(dat, select = c("a", "x*y"), group = "ID") + demean(dat, select = c("a", "x*y"), by = "ID") Output a_between x_y_between a_within x_y_within 1 2.666667 4.666667 -1.6666667 -0.6666667 diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R index a2f803cd7..566bd6097 100644 --- a/tests/testthat/test-demean.R +++ b/tests/testthat/test-demean.R @@ -2,29 +2,35 @@ test_that("demean works", { df <- iris set.seed(123) - df$ID <- sample(1:4, nrow(df), replace = TRUE) # fake-ID + df$ID <- sample.int(4, nrow(df), replace = TRUE) # fake-ID set.seed(123) df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable set.seed(123) - x <- demean(df, select = c("Sepal.Length", "Petal.Length"), group = "ID") + x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID") expect_snapshot(head(x)) set.seed(123) expect_message( - x <- demean(df, select = c("Sepal.Length", "binary", "Species"), group = "ID"), + { + x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID") + }, "have been coerced to numeric" ) expect_snapshot(head(x)) set.seed(123) expect_message( - y <- demean(df, select = ~ Sepal.Length + binary + Species, group = ~ID), + { + y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID) + }, "have been coerced to numeric" ) expect_message( - z <- demean(df, select = c("Sepal.Length", "binary", "Species"), group = "ID"), + { + z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID") + }, "have been coerced to numeric" ) expect_identical(y, z) @@ -39,7 +45,7 @@ test_that("demean interaction term", { ) set.seed(123) - expect_snapshot(demean(dat, select = c("a", "x*y"), group = "ID")) + expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID")) }) test_that("demean shows message if some vars don't exist", { @@ -52,7 +58,7 @@ test_that("demean shows message if some vars don't exist", { set.seed(123) expect_message( - demean(dat, select = "foo", group = "ID"), + demean(dat, select = "foo", by = "ID"), regexp = "not found" ) })