diff --git a/.dev/test-value_at.R b/.dev/test-value_at.R new file mode 100644 index 000000000..3c8272101 --- /dev/null +++ b/.dev/test-value_at.R @@ -0,0 +1,12 @@ +test_that("value_at", { + data(efc, package = "datawizard") + expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE) + expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE) + expect_null(value_at(efc$e42dep, 123456)) + expect_null(value_at(efc$e42dep, NULL)) + expect_error(value_at(efc$e42dep, NA), regex = "`position` can't") + expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't") +}) diff --git a/.dev/value_at.R b/.dev/value_at.R new file mode 100644 index 000000000..cdadc9dc6 --- /dev/null +++ b/.dev/value_at.R @@ -0,0 +1,52 @@ +#' @title Find the value(s) at a specific position in a variable +#' @name value_at +#' +#' @description This function can be used to extract one or more values at a +#' specific position in a variable. +#' +#' @param x A vector or factor. +#' @param position An integer or a vector of integers, indicating the position(s) +#' of the value(s) to be returned. Negative values are counted from the end of +#' the vector. If `NA`, an error is thrown. +#' @param remove_na Logical, if `TRUE`, missing values are removed before +#' computing the position. If `FALSE`, missing values are included in the +#' computation. +#' @param default The value to be returned if the position is out of range. +#' +#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call. +#' +#' @return A vector with the value(s) at the specified position(s). +#' +#' @examples +#' data(mtcars) +#' # 5th value +#' value_at(mtcars$mpg, 5) +#' # last value +#' value_at(mtcars$mpg, -1) +#' # out of range, return default +#' value_at(mtcars$mpg, 150) +#' # return 2nd and fifth value +#' value_at(mtcars$mpg, c(2, 5)) +#' @export +value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) { + if (remove_na) { + x <- x[!is.na(x)] + } + n <- length(x) + unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE) +} + +# helper ---- + +.values_at <- function(x, position, n, default) { + if (is.na(position)) { + insight::format_error("`position` can't be `NA`.") + } + if (position < 0L) { + position <- position + n + 1 + } + if (position <= 0 || position > n) { + return(default) + } + x[position] +} diff --git a/R/adjust.R b/R/adjust.R index 01b1b5a2c..821821864 100644 --- a/R/adjust.R +++ b/R/adjust.R @@ -124,11 +124,11 @@ adjust <- function(data, predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")") } formula_predictors <- paste(c("1", predictors), collapse = " + ") - formula <- paste(var, "~", formula_predictors) + model_formula <- paste(var, "~", formula_predictors) x <- .model_adjust_for( data = data[unique(c(var, effect, facs))], - formula, + model_formula = model_formula, multilevel = multilevel, additive = additive, bayesian = bayesian, @@ -148,7 +148,7 @@ data_adjust <- adjust #' @keywords internal .model_adjust_for <- function(data, - formula, + model_formula, multilevel = FALSE, additive = FALSE, bayesian = FALSE, @@ -159,32 +159,28 @@ data_adjust <- adjust # Bayesian if (bayesian) { insight::check_if_installed("rstanarm") - model <- rstanarm::stan_gamm4(stats::as.formula(formula), random = formula_random, data = data, refresh = 0) + model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0) # Frequentist } else { insight::check_if_installed("gamm4") - model <- gamm4::gamm4(stats::as.formula(formula), random = formula_random, data = data) + model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data) } # Linear ------------------------- - } else { + } else if (bayesian) { # Bayesian - if (bayesian) { - insight::check_if_installed("rstanarm") - if (multilevel) { - model <- rstanarm::stan_lmer(paste(formula, formula_random), data = data, refresh = 0) - } else { - model <- rstanarm::stan_glm(formula, data = data, refresh = 0) - } - # Frequentist + insight::check_if_installed("rstanarm") + if (multilevel) { + model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0) } else { - if (multilevel) { - insight::check_if_installed("lme4") - model <- lme4::lmer(paste(formula, formula_random), data = data) - } else { - model <- stats::lm(formula, data = data) - } + model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0) } + } else if (multilevel) { + # Frequentist + insight::check_if_installed("lme4") + model <- lme4::lmer(paste(model_formula, formula_random), data = data) + } else { + model <- stats::lm(model_formula, data = data) } adjusted <- insight::get_residuals(model) diff --git a/R/assign_labels.R b/R/assign_labels.R index 9390101ad..bd35513bf 100644 --- a/R/assign_labels.R +++ b/R/assign_labels.R @@ -79,7 +79,7 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { attr(x, "label") <- variable } else { insight::format_error( - "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." + "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint ) } } @@ -88,13 +88,13 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { if (!is.null(values)) { # extract unique values unique_values <- as.vector(sort(stats::na.omit(unique(x)))) - labels <- NULL + value_labels <- NULL # do we have a names vector for "values"? # else check if number of labels and values match if (is.null(names(values))) { if (length(values) == length(unique_values)) { - labels <- stats::setNames(unique_values, values) + value_labels <- stats::setNames(unique_values, values) } else { insight::format_error( "Cannot add labels. Number of unique values and number of value labels are not equal.", @@ -114,11 +114,11 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { if (length(values)) { # we need to switch names and values - labels <- stats::setNames(coerce_to_numeric(names(values)), values) + value_labels <- stats::setNames(coerce_to_numeric(names(values)), values) } } - attr(x, "labels") <- labels + attr(x, "labels") <- value_labels } x diff --git a/R/categorize.R b/R/categorize.R index d440069d8..341d0c0c9 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -145,28 +145,8 @@ categorize.numeric <- function(x, labels = NULL, verbose = TRUE, ...) { - # check arguments - if (is.character(split)) { - split <- match.arg( - split, - choices = c( - "median", "mean", "quantile", "equal_length", "equal_range", - "equal", "equal_distance", "range", "distance" - ) - ) - } - - if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) { - insight::format_error( - "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified." - ) - } - - if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) { - insight::format_error( - "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified." - ) - } + # sanity check + split <- .sanitize_split_arg(split, n_groups, range) # handle aliases if (identical(split, "equal_length")) split <- "length" @@ -221,28 +201,7 @@ categorize.numeric <- function(x, original_x[!is.na(original_x)] <- out # turn into factor? - if (!is.null(labels)) { - if (length(labels) == length(unique(out))) { - original_x <- as.factor(original_x) - levels(original_x) <- labels - } else if (length(labels) == 1 && labels %in% c("mean", "median")) { - original_x <- as.factor(original_x) - no_na_x <- original_x[!is.na(original_x)] - if (labels == "mean") { - labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x - } else { - labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x - } - levels(original_x) <- insight::format_value(labels, ...) - } else if (isTRUE(verbose)) { - insight::format_warning( - "Argument `labels` and levels of the recoded variable are not of the same length.", - "Variable will not be converted to factor." - ) - } - } - - original_x + .original_x_to_factor(original_x, x, labels, out, verbose, ...) } @@ -283,15 +242,15 @@ categorize.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x[select] <- lapply( @@ -342,15 +301,15 @@ categorize.grouped_df <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x <- as.data.frame(x) @@ -387,3 +346,56 @@ categorize.grouped_df <- function(x, } seq(lowest, max(x), by = range) } + + +.sanitize_split_arg <- function(split, n_groups, range) { + # check arguments + if (is.character(split)) { + split <- match.arg( + split, + choices = c( + "median", "mean", "quantile", "equal_length", "equal_range", + "equal", "equal_distance", "range", "distance" + ) + ) + } + + if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) { + insight::format_error( + "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified." + ) + } + + if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) { + insight::format_error( + "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified." + ) + } + + split +} + + +.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) { + if (!is.null(labels)) { + if (length(labels) == length(unique(out))) { + original_x <- as.factor(original_x) + levels(original_x) <- labels + } else if (length(labels) == 1 && labels %in% c("mean", "median")) { + original_x <- as.factor(original_x) + no_na_x <- original_x[!is.na(original_x)] + if (labels == "mean") { + labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x + } else { + labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x + } + levels(original_x) <- insight::format_value(labels, ...) + } else if (isTRUE(verbose)) { + insight::format_warning( + "Argument `labels` and levels of the recoded variable are not of the same length.", + "Variable will not be converted to factor." + ) + } + } + original_x +} diff --git a/R/data_modify.R b/R/data_modify.R index 6234255fd..c9b9d035a 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -350,7 +350,14 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = } # finally, we can evaluate expression and get values for new variables - new_variable <- try(with(data, eval(symbol)), silent = TRUE) + symbol_string <- insight::safe_deparse(symbol) + if (!is.null(symbol_string) && all(symbol_string == "n()")) { + # "special" functions + new_variable <- nrow(data) + } else { + # default evaluation of expression + new_variable <- try(with(data, eval(symbol)), silent = TRUE) + } # successful, or any errors, like misspelled variable name? if (inherits(new_variable, "try-error")) { diff --git a/R/data_summary.R b/R/data_summary.R index 5f8b0ee08..c0c5bc50f 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -8,13 +8,14 @@ #' @param by Optional character string, indicating the name of a variable in `x`. #' If supplied, the data will be split by this variable and summary statistics #' will be computed for each group. -#' @param include_na Logical, if `TRUE`, missing values are included as a level +#' @param include_na Logical. If `TRUE`, missing values are included as a level #' in the grouping variable. If `FALSE`, missing values are omitted from the #' grouping variable. #' @param ... One or more named expressions that define the new variable name #' and the function to compute the summary statistic. Example: #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided -#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. +#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The +#' summary function `n()` can be used to count the number of observations. #' #' @return A data frame with the requested summary statistics. #' @@ -38,6 +39,17 @@ #' #' # expressions can also be supplied as character strings #' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) +#' +#' # count observations within groups +#' data_summary(mtcars, observations = n(), by = c("am", "gear")) +#' +#' # first and last observations of "mpg" within groups +#' data_summary( +#' mtcars, +#' first = mpg[1], +#' last = mpg[length(mpg)], +#' by = c("am", "gear") +#' ) #' @export data_summary <- function(x, ...) { UseMethod("data_summary") @@ -74,19 +86,24 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { } else { # sanity check - is "by" a character string? if (!is.character(by)) { - insight::format_error("Argument `by` must be a character string, indicating the name of a variable in the data.") + insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.") } # is "by" in the data? if (!all(by %in% colnames(x))) { by_not_found <- by[!by %in% colnames(x)] insight::format_error( - paste0("Variable \"", by_not_found, "\" not found in the data."), + paste0( + "Variable", + ifelse(length(by_not_found) > 1, "s ", " "), + text_concatenate(by_not_found, enclose = "\""), + " not found in the data." + ), .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") ) } # split data, add NA levels, if requested l <- lapply(x[by], function(i) { - if (include_na) { + if (include_na && anyNA(i)) { addNA(i) } else { i @@ -178,6 +195,18 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { }) } + # check for correct length of output - must be a single value! + if (any(lengths(out) != 1)) { + insight::format_error( + paste0( + "Each expression must return a single value. Following expression", + ifelse(sum(lengths(out) != 1) > 1, "s", " "), + " returned more than one value: ", + text_concatenate(vapply(dots[lengths(out) != 1], insight::safe_deparse, character(1)), enclose = "\"") + ) + ) + } + out } diff --git a/man/data_summary.Rd b/man/data_summary.Rd index 0602dc7da..ccbf4c524 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -15,13 +15,14 @@ data_summary(x, ...) \item{...}{One or more named expressions that define the new variable name and the function to compute the summary statistic. Example: \code{mean_sepal_width = mean(Sepal.Width)}. The expression can also be provided -as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}.} +as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}. The +summary function \code{n()} can be used to count the number of observations.} \item{by}{Optional character string, indicating the name of a variable in \code{x}. If supplied, the data will be split by this variable and summary statistics will be computed for each group.} -\item{include_na}{Logical, if \code{TRUE}, missing values are included as a level +\item{include_na}{Logical. If \code{TRUE}, missing values are included as a level in the grouping variable. If \code{FALSE}, missing values are omitted from the grouping variable.} } @@ -52,4 +53,15 @@ data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) # expressions can also be supplied as character strings data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) + +# count observations within groups +data_summary(mtcars, observations = n(), by = c("am", "gear")) + +# first and last observations of "mpg" within groups +data_summary( + mtcars, + first = mpg[1], + last = mpg[length(mpg)], + by = c("am", "gear") +) } diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R index aede5c042..746d4c51a 100644 --- a/tests/testthat/test-data_summary.R +++ b/tests/testthat/test-data_summary.R @@ -141,6 +141,26 @@ test_that("data_summary, errors", { data_summary(mtcars, mw = mesn(mpg), by = "am"), regex = "There was an error" ) + # wrong variable name + expect_error( + data_summary(mtcars, n = max(mpeg)), + regex = "There was an error" + ) + # expression returns more than one value + expect_error( + data_summary(mtcars, n = unique(mpg), j = c(min(am), max(am)), by = c("am", "gear")), + regex = "Each expression must return" + ) +}) + + +test_that("data_summary, values_at", { + data(mtcars) + out <- data_summary(mtcars, pos1 = mpg[1], pos_end = mpg[length(mpg)], by = c("am", "gear")) + # same as: + # dplyr::summarise(mtcars, pos1 = dplyr::first(mpg), pos_end = dplyr::last(mpg), .by = c("am", "gear")) + expect_equal(out$pos1, c(21.4, 24.4, 21, 26), tolerance = 1e-3) + expect_equal(out$pos_end, c(19.2, 17.8, 21.4, 15), tolerance = 1e-3) }) @@ -200,3 +220,11 @@ test_that("data_summary, expression as variable", { expect_named(out, c("am", "gear", "MW", "SD")) expect_equal(out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4) }) + + +test_that("data_summary, extra functions", { + data(mtcars) + # n() + out <- data_summary(mtcars, n = n(), by = c("am", "gear")) + expect_identical(out$n, c(15L, 4L, 8L, 5L)) +})