diff --git a/DESCRIPTION b/DESCRIPTION index 99a599901..41a1744fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.9.1 +Version: 0.9.1.1 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), @@ -72,7 +72,7 @@ VignetteBuilder: Encoding: UTF-8 Language: en-US Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: diff --git a/NEWS.md b/NEWS.md index 03a0d7c89..25d043259 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# datawizard 0.9.2 + +CHANGES + +* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify + variables at specific positions or based on logical conditions. + # datawizard 0.9.1 CHANGES diff --git a/R/data_modify.R b/R/data_modify.R index 9fa9a3b9a..0aea4a287 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -26,6 +26,14 @@ #' Note that newly created variables can be used in subsequent expressions. #' See also 'Examples'. #' +#' @param .at A character vector of variable names that should be modified. This +#' argument is used in combination with the `.modify` argument. +#' @param .if A function that returns `TRUE` for columns in the data frame where +#' `.if` applies. This argument is used in combination with the `.modify` argument. +#' @param .modify A function that modifies the variables defined in `.at` or `.if`. +#' This argument is used in combination with either the `.at` or the `.if`argument. +#' If `.modify` is not provided, `.at` and `.if` are ignored. +#' #' @note `data_modify()` can also be used inside functions. However, it is #' recommended to pass the recode-expression as character vector or list of #' characters. @@ -91,6 +99,15 @@ #' #' new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10") #' foo(iris, new_exp) +#' +#' # modify at specific positions or if condition is met +#' d <- iris[1:5, ] +#' data_modify(d, .at = "Species", .modify = as.numeric) +#' data_modify(d, .if = is.factor, .modify = as.numeric) +#' +#' # can be combined with dots +#' data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) +#' #' @export data_modify <- function(data, ...) { UseMethod("data_modify") @@ -102,113 +119,120 @@ data_modify.default <- function(data, ...) { } #' @export -data_modify.data.frame <- function(data, ...) { +data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { dots <- eval(substitute(alist(...))) column_names <- colnames(data) - # we check for character vector of expressions, in which case - # "dots" should be unnamed - if (is.null(names(dots))) { - # if we have multiple strings, concatenate them to a character vector - # and put it into a list... - if (length(dots) > 1) { - if (all(vapply(dots, is.character, logical(1)))) { - dots <- list(unlist(dots)) - } else { - insight::format_error("You cannot mix string and literal representation of expressions.") + # check if we have dots, or only at/modify ---- + + if (length(dots)) { + # we check for character vector of expressions, in which case + # "dots" should be unnamed + if (is.null(names(dots))) { + # if we have multiple strings, concatenate them to a character vector + # and put it into a list... + if (length(dots) > 1) { + if (all(vapply(dots, is.character, logical(1)))) { + dots <- list(unlist(dots)) + } else { + insight::format_error("You cannot mix string and literal representation of expressions.") + } + } + # expression is given as character string, e.g. + # a <- "double_SepWidth = 2 * Sepal.Width" + # data_modify(iris, a) + # or as character vector, e.g. + # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10")) + character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL) + # do we have a character vector? Then we can proceed + if (is.character(character_symbol)) { + dots <- lapply(character_symbol, function(s) { + # turn value from character vector into expression + str2lang(.dynEval(s)) + }) + names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1)) } } - # expression is given as character string, e.g. - # a <- "double_SepWidth = 2 * Sepal.Width" - # data_modify(iris, a) - # or as character vector, e.g. - # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10")) - character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL) - # do we have a character vector? Then we can proceed - if (is.character(character_symbol)) { - dots <- lapply(character_symbol, function(s) { - # turn value from character vector into expression - str2lang(.dynEval(s)) - }) - names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1)) - } - } - for (i in seq_along(dots)) { - # iterate expressions for new variables - symbol <- dots[[i]] + for (i in seq_along(dots)) { + # iterate expressions for new variables + symbol <- dots[[i]] - # expression is given as character string in a variable, but named, e.g. - # a <- "2 * Sepal.Width" - # data_modify(iris, double_SepWidth = a) - # we reconstruct the symbol as if it were provided as literal expression. - # However, we need to check that we don't have a character vector, - # like: data_modify(iris, new_var = "a") - # this one should be recycled instead. - if (!is.character(symbol)) { - eval_symbol <- .dynEval(symbol, ifnotfound = NULL) - if (is.character(eval_symbol)) { - symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) - # we may have the edge-case of having a function that returns a character - # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" - # is of type character, but no symbol, thus str2lang() above creates a - # wrong pattern. We then take "eval_symbol" as character input. - if (inherits(symbol, "try-error")) { - symbol <- str2lang(paste0( - names(dots)[i], - " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" - )) + # expression is given as character string in a variable, but named, e.g. + # a <- "2 * Sepal.Width" + # data_modify(iris, double_SepWidth = a) + # we reconstruct the symbol as if it were provided as literal expression. + # However, we need to check that we don't have a character vector, + # like: data_modify(iris, new_var = "a") + # this one should be recycled instead. + if (!is.character(symbol)) { + eval_symbol <- .dynEval(symbol, ifnotfound = NULL) + if (is.character(eval_symbol)) { + symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) + # we may have the edge-case of having a function that returns a character + # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" + # is of type character, but no symbol, thus str2lang() above creates a + # wrong pattern. We then take "eval_symbol" as character input. + if (inherits(symbol, "try-error")) { + symbol <- str2lang(paste0( + names(dots)[i], + " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" + )) + } } } - } - # finally, we can evaluate expression and get values for new variables - new_variable <- try(with(data, eval(symbol)), silent = TRUE) + # finally, we can evaluate expression and get values for new variables + new_variable <- try(with(data, eval(symbol)), silent = TRUE) - # successful, or any errors, like misspelled variable name? - if (inherits(new_variable, "try-error")) { - # in which step did error happen? - step_number <- switch(as.character(i), - "1" = "the first expression", - "2" = "the second expression", - "3" = "the third expression", - paste("expression", i) - ) - step_msg <- paste0("There was an error in ", step_number, ".") - # try to find out which variable was the cause for the error - error_msg <- attributes(new_variable)$condition$message - if (grepl("object '(.*)' not found", error_msg)) { - error_var <- gsub("object '(.*)' not found", "\\1", error_msg) + # successful, or any errors, like misspelled variable name? + if (inherits(new_variable, "try-error")) { + # in which step did error happen? + step_number <- switch(as.character(i), + "1" = "the first expression", + "2" = "the second expression", + "3" = "the third expression", + paste("expression", i) + ) + step_msg <- paste0("There was an error in ", step_number, ".") + # try to find out which variable was the cause for the error + error_msg <- attributes(new_variable)$condition$message + if (grepl("object '(.*)' not found", error_msg)) { + error_var <- gsub("object '(.*)' not found", "\\1", error_msg) + insight::format_error( + paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), + .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") + ) + } else { + insight::format_error(paste0( + step_msg, " ", insight::format_capitalize(error_msg), + ". Possibly misspelled or not yet defined?" + )) + } + } + + # give informative error when new variable doesn't match number of rows + if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) { insight::format_error( - paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), - .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") + "New variable has not the same length as the other variables in the data frame and cannot be recycled." ) - } else { - insight::format_error(paste0( - step_msg, " ", insight::format_capitalize(error_msg), - ". Possibly misspelled or not yet defined?" - )) } - } - # give informative error when new variable doesn't match number of rows - if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) { - insight::format_error( - "New variable has not the same length as the other variables in the data frame and cannot be recycled." - ) + data[[names(dots)[i]]] <- new_variable } - - data[[names(dots)[i]]] <- new_variable } + # check if we have at/modify ---- + data <- .modify_at(data, .at, .if, .modify, column_names) + data } #' @export -data_modify.grouped_df <- function(data, ...) { +data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { # we need to evaluate dots here, and pass them with "do.call" to # the data.frame method later... - dots <- match.call(expand.dots = FALSE)$`...` + dots <- match.call(expand.dots = FALSE)[["..."]] # works only for dplyr >= 0.8.0 grps <- attr(data, "groups", exact = TRUE) @@ -262,8 +286,55 @@ data_modify.grouped_df <- function(data, ...) { data[rows, ] <- data_modify.data.frame(data[rows, ], ...) } + # check if we have at/modify ---- + data <- .modify_at(data, .at, .if, .modify, column_names) + # set back attributes and class data <- .replace_attrs(data, attr_data) class(data) <- class_attr data } + + +# helper ------------- + +.modify_at <- function(data, .at, .if, .modify, column_names) { + # make sure either .at or .if is defined, not both + if (!is.null(.at) && !is.null(.if)) { + insight::format_error("You cannot use both `.at` and `.if` at the same time.") + } + if ((!is.null(.at) || !is.null(.if)) && !is.null(.modify)) { + # if we have ".if" defined, specify ".at" + if (!is.null(.if)) { + .at <- column_names[vapply(data[column_names], .if, logical(1))] + } + # make sure "modify" is a function + if (!is.function(.modify)) { + insight::format_error("`.modify` must be a function.") + } + # check for valid defined column names + if (!all(.at %in% column_names)) { + not_found <- .at[!.at %in% column_names] + insight::format_alert( + paste0( + "Variable", + ifelse(length(not_found) > 1, "s ", " "), + text_concatenate(not_found, enclose = "\""), + ifelse(length(not_found) > 1, "were", "was"), + " not found in the dataset." + ), + .misspelled_string(column_names, not_found, "Possibly misspelled or not yet defined?") + ) + } + # modify variables + found <- .at[.at %in% column_names] + if (length(found)) { + for (i in found) { + data[[i]] <- .modify(data[[i]]) + } + } else { + insight::format_alert("No variables found in the dataset that match the `.if` or `.at` argument.") + } + } + data +} diff --git a/man/data_modify.Rd b/man/data_modify.Rd index 7f13a8c08..0eadebc20 100644 --- a/man/data_modify.Rd +++ b/man/data_modify.Rd @@ -31,6 +31,16 @@ Example: \code{Petal.Width = NULL}. Note that newly created variables can be used in subsequent expressions. See also 'Examples'.} + +\item{.at}{A character vector of variable names that should be modified. This +argument is used in combination with the \code{.modify} argument.} + +\item{.if}{A function that returns \code{TRUE} for columns in the data frame where +\code{.if} applies. This argument is used in combination with the \code{.modify} argument.} + +\item{.modify}{A function that modifies the variables defined in \code{.at} or \code{.if}. +This argument is used in combination with either the \code{.at} or the \code{.if}argument. +If \code{.modify} is not provided, \code{.at} and \code{.if} are ignored.} } \description{ Create new variables or modify existing variables in a data frame. Unlike \code{base::transform()}, \code{data_modify()} @@ -103,4 +113,13 @@ foo(iris, "var_a = Sepal.Width / 10") new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10") foo(iris, new_exp) + +# modify at specific positions or if condition is met +d <- iris[1:5, ] +data_modify(d, .at = "Species", .modify = as.numeric) +data_modify(d, .if = is.factor, .modify = as.numeric) + +# can be combined with dots +data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) + } diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd index a9ea35f0a..ca3dc7e59 100644 --- a/man/datawizard-package.Rd +++ b/man/datawizard-package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{datawizard-package} \alias{datawizard-package} -\alias{_PACKAGE} \alias{datawizard} \title{datawizard: Easy Data Wrangling and Statistical Transformations} \description{