Skip to content

Commit

Permalink
Unexpected missing values in data_tabulate() (#518)
Browse files Browse the repository at this point in the history
* Unexpected missing values in `data_tabulate()`
Fixes #514

* reverse usage

* fix tests (FALSE -> TRUE)

* docs, news

* Update NEWS.md

Co-authored-by: Indrajeet Patil <patilindrajeet.science@gmail.com>

* add  comment

---------

Co-authored-by: Indrajeet Patil <patilindrajeet.science@gmail.com>
  • Loading branch information
strengejacke and IndrajeetPatil authored Jun 23, 2024
1 parent 218fbbb commit ebe48b4
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 85 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.11.0.3
Version: 0.11.0.4
Authors@R: c(
person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

BREAKING CHANGES

* The argument `include_na` in `data_tabulate()` and `data_summary()` has been
renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and
`TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former
`include_na = FALSE`).

* Class names for objects returned by `data_tabulate()` have been changed to
`datawizard_table` and `datawizard_crosstable` (resp. the plural forms,
`*_tables`), to provide a clearer and more consistent naming scheme.
Expand Down
22 changes: 11 additions & 11 deletions R/data_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#' @param by Optional character string, indicating the name of a variable in `x`.
#' If supplied, the data will be split by this variable and summary statistics
#' will be computed for each group.
#' @param include_na Logical. If `TRUE`, missing values are included as a level
#' in the grouping variable. If `FALSE`, missing values are omitted from the
#' grouping variable.
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
#' @param ... One or more named expressions that define the new variable name
#' and the function to compute the summary statistic. Example:
#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
Expand Down Expand Up @@ -57,8 +57,8 @@ data_summary <- function(x, ...) {


#' @export
data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) {
data_summary(as.data.frame(x), ..., by = by, include_na = include_na)
data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) {
data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na)
}


Expand All @@ -70,7 +70,7 @@ data_summary.default <- function(x, ...) {

#' @rdname data_summary
#' @export
data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
dots <- eval(substitute(alist(...)))

# do we have any expression at all?
Expand Down Expand Up @@ -103,10 +103,10 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
}
# split data, add NA levels, if requested
l <- lapply(x[by], function(i) {
if (include_na && anyNA(i)) {
addNA(i)
} else {
if (remove_na || !anyNA(i)) {
i
} else {
addNA(i)
}
})
split_data <- split(x, l, drop = TRUE)
Expand Down Expand Up @@ -137,7 +137,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {


#' @export
data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
# extract group variables
grps <- attr(x, "groups", exact = TRUE)
group_variables <- data_remove(grps, ".rows")
Expand All @@ -148,7 +148,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
# remove information specific to grouped df's
attr(x, "groups") <- NULL
class(x) <- "data.frame"
data_summary(x, ..., by = by, include_na = include_na)
data_summary(x, ..., by = by, remove_na = remove_na)
}


Expand Down
56 changes: 30 additions & 26 deletions R/data_tabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' factor levels are dropped from the frequency table.
#' @param name Optional character string, which includes the name that is used
#' for printing.
#' @param include_na Logical, if `TRUE`, missing values are included in the
#' @param remove_na Logical, if `FALSE`, missing values are included in the
#' frequency or crosstable, else missing values are omitted.
#' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
#' table for printing. This affects only printing, not the returned object.
Expand All @@ -40,7 +40,7 @@
#' (missing) values by default. The first column indicates values of `x`, the
#' first row indicates values of `by` (including missing values). The last row
#' and column contain the total frequencies for each row and column, respectively.
#' Setting `include_na = FALSE` will omit missing values from the crosstable.
#' Setting `remove_na = FALSE` will omit missing values from the crosstable.
#' Setting `proportions` to `"row"` or `"column"` will add row or column
#' percentages. Setting `proportions` to `"full"` will add relative frequencies
#' for the full table.
Expand All @@ -62,7 +62,7 @@
#' data_tabulate(efc$c172code)
#'
#' # drop missing values
#' data_tabulate(efc$c172code, include_na = FALSE)
#' data_tabulate(efc$c172code, remove_na = TRUE)
#'
#' # data frame
#' data_tabulate(efc, c("e42dep", "c172code"))
Expand Down Expand Up @@ -109,7 +109,7 @@
#' efc$c172code,
#' by = efc$e16sex,
#' proportions = "column",
#' include_na = FALSE
#' remove_na = TRUE
#' )
#'
#' # round percentages
Expand All @@ -133,7 +133,7 @@ data_tabulate.default <- function(x,
by = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
remove_na = FALSE,
proportions = NULL,
name = NULL,
verbose = TRUE,
Expand Down Expand Up @@ -163,7 +163,7 @@ data_tabulate.default <- function(x,
x,
by = by,
weights = weights,
include_na = include_na,
remove_na = remove_na,
proportions = proportions,
obj_name = obj_name,
group_variable = group_variable
Expand All @@ -172,30 +172,34 @@ data_tabulate.default <- function(x,

# frequency table
if (is.null(weights)) {
if (include_na) {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
} else {
if (remove_na) {
# we have a `.default` and a `.data.frame` method for `data_tabulate()`.
# since this is the default, `x` can be an object which cannot be used
# with `table()`, that's why we add `tryCatch()` here. Below we give an
# informative error message for non-supported objects.
freq_table <- tryCatch(table(x), error = function(e) NULL)
} else {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
}
} else if (include_na) {
# weighted frequency table, including NA
} else if (remove_na) {
# weighted frequency table, excluding NA
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = addNA(x)),
na.action = stats::na.pass,
addNA = TRUE
data = data.frame(weights = weights, x = x),
na.action = stats::na.omit,
addNA = FALSE
),
error = function(e) NULL
)
} else {
# weighted frequency table, excluding NA
# weighted frequency table, including NA
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = x),
na.action = stats::na.omit,
addNA = FALSE
data = data.frame(weights = weights, x = addNA(x)),
na.action = stats::na.pass,
addNA = TRUE
),
error = function(e) NULL
)
Expand All @@ -218,12 +222,12 @@ data_tabulate.default <- function(x,

out$`Raw %` <- 100 * out$N / sum(out$N)
# if we have missing values, we add a row with NA
if (include_na) {
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
} else {
if (remove_na) {
out$`Valid %` <- 100 * out$N / sum(out$N)
valid_n <- sum(out$N, na.rm = TRUE)
} else {
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
}
out$`Cumulative %` <- cumsum(out$`Valid %`)

Expand Down Expand Up @@ -271,7 +275,7 @@ data_tabulate.data.frame <- function(x,
by = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
remove_na = FALSE,
proportions = NULL,
collapse = FALSE,
verbose = TRUE,
Expand All @@ -297,7 +301,7 @@ data_tabulate.data.frame <- function(x,
proportions = proportions,
drop_levels = drop_levels,
weights = weights,
include_na = include_na,
remove_na = remove_na,
name = i,
verbose = verbose,
...
Expand Down Expand Up @@ -326,7 +330,7 @@ data_tabulate.grouped_df <- function(x,
proportions = NULL,
drop_levels = FALSE,
weights = NULL,
include_na = TRUE,
remove_na = FALSE,
collapse = FALSE,
verbose = TRUE,
...) {
Expand Down Expand Up @@ -362,7 +366,7 @@ data_tabulate.grouped_df <- function(x,
verbose = verbose,
drop_levels = drop_levels,
weights = weights,
include_na = include_na,
remove_na = remove_na,
by = by,
proportions = proportions,
group_variable = group_variable,
Expand Down
30 changes: 17 additions & 13 deletions R/data_xtabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
.crosstable <- function(x,
by,
weights = NULL,
include_na = TRUE,
remove_na = FALSE,
proportions = NULL,
obj_name = NULL,
group_variable = NULL) {
Expand All @@ -12,30 +12,34 @@
}
# frequency table
if (is.null(weights)) {
if (include_na) {
x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
} else {
# we have a `.default` and a `.data.frame` method for `data_tabulate()`.
# since this is the default, `x` can be an object which cannot be used
# with `table()`, that's why we add `tryCatch()` here. Below we give an
# informative error message for non-supported objects.
if (remove_na) {
x_table <- tryCatch(table(x, by), error = function(e) NULL)
} else {
x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
}
} else if (include_na) {
# weighted frequency table, including NA
} else if (remove_na) {
# weighted frequency table, excluding NA
x_table <- tryCatch(
stats::xtabs(
weights ~ x + by,
data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
na.action = stats::na.pass,
addNA = TRUE
data = data.frame(weights = weights, x = x, by = by),
na.action = stats::na.omit,
addNA = FALSE
),
error = function(e) NULL
)
} else {
# weighted frequency table, excluding NA
# weighted frequency table, including NA
x_table <- tryCatch(
stats::xtabs(
weights ~ x + by,
data = data.frame(weights = weights, x = x, by = by),
na.action = stats::na.omit,
addNA = FALSE
data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
na.action = stats::na.pass,
addNA = TRUE
),
error = function(e) NULL
)
Expand Down
8 changes: 4 additions & 4 deletions man/data_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions man/data_tabulate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit ebe48b4

Please sign in to comment.