Skip to content

Commit

Permalink
data_tabulate() gains a weights argument (#479)
Browse files Browse the repository at this point in the history
* `data_tabulate()` gains a `weights` argument

* add tests

* tests

* lintr

* news, comment

* typo

* fix test, remove whitespace
  • Loading branch information
strengejacke authored Feb 7, 2024
1 parent d129767 commit 3be20ad
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 14 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1.1
Version: 0.9.1.2
Authors@R: c(
person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ CHANGES
* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
variables at specific positions or based on logical conditions.

* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables.

# datawizard 0.9.1

CHANGES
Expand Down
2 changes: 1 addition & 1 deletion R/data_modify.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
#' .at = c("Species", "new_length"),
#' .modify = as.numeric
#' )}
#'
#'
#' # combine "data_find()" and ".at" argument
#' out <- data_modify(
#' d,
Expand Down
83 changes: 74 additions & 9 deletions R/data_tabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#' for printing.
#' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
#' table for printing. This affects only printing, not the returned object.
#' @param weights Optional numeric vector of weights. Must be of the same length
#' as `x`. If `weights` is supplied, weighted frequencies are calculated.
#' @param ... not used.
#' @inheritParams find_columns
#'
Expand Down Expand Up @@ -46,6 +48,12 @@
#'
#' # to remove the big mark, use "print(..., big_mark = "")"
#' print(data_tabulate(x), big_mark = "")
#'
#' # weighted frequencies
#' set.seed(123)
#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
#' data_tabulate(efc$e42dep, weights = efc$weights)
#'
#' @export
data_tabulate <- function(x, ...) {
UseMethod("data_tabulate")
Expand All @@ -54,7 +62,7 @@ data_tabulate <- function(x, ...) {

#' @rdname data_tabulate
#' @export
data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) {
data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) {
# save label attribute, before it gets lost...
var_label <- attr(x, "label", exact = TRUE)

Expand All @@ -70,8 +78,26 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
x <- droplevels(x)
}

# check for correct length of weights - must be equal to "x"
if (!is.null(weights) && length(weights) != length(x)) {
insight::format_error("Length of `weights` must be equal to length of `x`.")
}

# frequency table
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
if (is.null(weights)) {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
} else {
# weighted frequency table
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = x),
na.action = stats::na.pass,
addNA = TRUE
),
error = function(e) NULL
)
}

if (is.null(freq_table)) {
insight::format_warning(paste0("Can't compute frequency tables for objects of class `", class(x)[1], "`."))
Expand All @@ -83,6 +109,11 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
replacement = c("Value", "N")
)

# we want to round N for weighted frequencies
if (!is.null(weights)) {
out$N <- round(out$N)
}

out$`Raw %` <- 100 * out$N / sum(out$N)
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
out$`Cumulative %` <- cumsum(out$`Valid %`)
Expand Down Expand Up @@ -110,6 +141,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
attr(out, "object") <- obj_name
attr(out, "group_variable") <- group_variable
attr(out, "duplicate_varnames") <- duplicated(out$Variable)
attr(out, "weights") <- weights

attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE)
Expand All @@ -129,6 +161,7 @@ data_tabulate.data.frame <- function(x,
regex = FALSE,
collapse = FALSE,
drop_levels = FALSE,
weights = NULL,
verbose = TRUE,
...) {
# evaluate arguments
Expand All @@ -140,11 +173,12 @@ data_tabulate.data.frame <- function(x,
verbose = verbose
)
out <- lapply(select, function(i) {
data_tabulate(x[[i]], drop_levels = drop_levels, name = i, verbose = verbose, ...)
data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...)
})

class(out) <- c("dw_data_tabulates", "list")
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

out
}
Expand All @@ -159,6 +193,7 @@ data_tabulate.grouped_df <- function(x,
verbose = TRUE,
collapse = FALSE,
drop_levels = FALSE,
weights = NULL,
...) {
# works only for dplyr >= 0.8.0
grps <- attr(x, "groups", exact = TRUE)
Expand Down Expand Up @@ -191,12 +226,14 @@ data_tabulate.grouped_df <- function(x,
ignore_case = ignore_case,
verbose = verbose,
drop_levels = drop_levels,
weights = weights,
group_variable = group_variable,
...
))
}
class(out) <- c("dw_data_tabulates", "list")
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

out
}
Expand Down Expand Up @@ -270,7 +307,12 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark)

# summary of total and valid N (we may add mean/sd as well?)
summary_line <- sprintf("# total N=%s valid N=%s\n\n", a$total_n, a$valid_n)
summary_line <- sprintf(
"# total N=%s valid N=%s%s\n\n",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)
cat(insight::print_color(summary_line, "blue"))

# remove information that goes into the header/footer
Expand All @@ -295,7 +337,12 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
caption <- .table_header(x, "html")

# summary of total and valid N (we may add mean/sd as well?)
footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
footer <- sprintf(
"total N=%i valid N=%i%s",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)

# remove information that goes into the header/footer
x$Variable <- NULL
Expand All @@ -320,7 +367,12 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
caption <- .table_header(x, "markdown")

# summary of total and valid N (we may add mean/sd as well?)
footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
footer <- sprintf(
"total N=%i valid N=%i%s\n\n",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)

# remove information that goes into the header/footer
x$Variable <- NULL
Expand All @@ -339,6 +391,9 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {

#' @export
print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

a <- attributes(x)
if (!isTRUE(a$collapse) || length(x) == 1) {
for (i in seq_along(x)) {
Expand All @@ -356,7 +411,11 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
})

out <- do.call(rbind, x)
cat(insight::print_color("# Frequency Table\n\n", "blue"))
if (is_weighted) {
cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue"))
} else {
cat(insight::print_color("# Frequency Table\n\n", "blue"))
}

# print table
cat(insight::export_table(
Expand All @@ -371,6 +430,9 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {

#' @export
print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

if (length(x) == 1) {
print_html(x[[1]], big_mark = big_mark, ...)
} else {
Expand All @@ -387,7 +449,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
insight::export_table(
out,
missing = "<NA>",
caption = "Frequency Table",
caption = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table"),
format = "html",
group_by = "Group"
)
Expand All @@ -397,6 +459,9 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {

#' @export
print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

if (length(x) == 1) {
print_md(x[[1]], big_mark = big_mark, ...)
} else {
Expand All @@ -417,7 +482,7 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
missing = "(NA)",
empty_line = "-",
format = "markdown",
title = "Frequency Table"
title = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table")
)
}
}
Expand Down
18 changes: 17 additions & 1 deletion man/data_tabulate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

79 changes: 79 additions & 0 deletions tests/testthat/_snaps/data_tabulate.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,82 @@
# data_tabulate, weights

Code
print(data_tabulate(efc$e42dep, weights = efc$weights))
Output
elder's dependency (efc$e42dep) <categorical>
# total N=105 valid N=100 (weighted)
Value | N | Raw % | Valid % | Cumulative %
------+----+-------+---------+-------------
1 | 3 | 2.86 | 3.00 | 3.00
2 | 4 | 3.81 | 4.00 | 7.00
3 | 26 | 24.76 | 26.00 | 33.00
4 | 67 | 63.81 | 67.00 | 100.00
<NA> | 5 | 4.76 | <NA> | <NA>

---

Code
print_md(data_tabulate(efc$e42dep, weights = efc$weights))
Output
[1] "Table: elder's dependency (efc$e42dep) (categorical)"
[2] ""
[3] "|Value | N| Raw %| Valid %| Cumulative %|"
[4] "|:-----|--:|-----:|-------:|------------:|"
[5] "|1 | 3| 2.86| 3.00| 3.00|"
[6] "|2 | 4| 3.81| 4.00| 7.00|"
[7] "|3 | 26| 24.76| 26.00| 33.00|"
[8] "|4 | 67| 63.81| 67.00| 100.00|"
[9] "|(NA) | 5| 4.76| (NA)| (NA)|"
[10] "total N=105 valid N=100 (weighted)\n\n"
attr(,"format")
[1] "pipe"
attr(,"class")
[1] "knitr_kable" "character"

---

Code
print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$
weights))
Output
# Frequency Table (weighted)
Variable | Value | N | Raw % | Valid % | Cumulative %
---------+-------+----+-------+---------+-------------
e42dep | 1 | 3 | 2.86 | 3.00 | 3.00
| 2 | 4 | 3.81 | 4.00 | 7.00
| 3 | 26 | 24.76 | 26.00 | 33.00
| 4 | 67 | 63.81 | 67.00 | 100.00
| <NA> | 5 | 4.76 | <NA> | <NA>
---------+-------+----+-------+---------+-------------
e16sex | 1 | 50 | 47.62 | 100.00 | 100.00
| 2 | 55 | 52.38 | <NA> | <NA>
------------------------------------------------------

---

Code
print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))
Output
[1] "Table: Frequency Table (weighted)"
[2] ""
[3] "|Variable | Value| N| Raw %| Valid %| Cumulative %|"
[4] "|:--------|-----:|--:|-----:|-------:|------------:|"
[5] "|e42dep | 1| 3| 2.86| 3.00| 3.00|"
[6] "| | 2| 4| 3.81| 4.00| 7.00|"
[7] "| | 3| 26| 24.76| 26.00| 33.00|"
[8] "| | 4| 67| 63.81| 67.00| 100.00|"
[9] "| | (NA)| 5| 4.76| (NA)| (NA)|"
[10] "| | | | | | |"
[11] "|e16sex | 1| 50| 47.62| 100.00| 100.00|"
[12] "| | 2| 55| 52.38| (NA)| (NA)|"
[13] "| | | | | | |"
attr(,"format")
[1] "pipe"
attr(,"class")
[1] "knitr_kable" "character"

# data_tabulate print

Code
Expand Down
Loading

0 comments on commit 3be20ad

Please sign in to comment.