diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..81e0262 Binary files /dev/null and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION index c8ff1c1..7f9047c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bfuncs Type: Package Title: This Repository Is A Random Smattering Of Functions I Wrote For Myself - You Can Use Them Too -Version: 0.2 +Version: 0.2.1 Author: person("Brad", "Cannell", email = "brad.cannell@gmail.com", role = c("aut", "cre")) Maintainer: Brad Cannell Description: Random functions for me. diff --git a/R/.DS_Store b/R/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/R/.DS_Store differ diff --git a/R/format_table.R b/R/format_table.R index ef40249..ba924cc 100644 --- a/R/format_table.R +++ b/R/format_table.R @@ -9,7 +9,7 @@ #' #' @param ... Other parameters to be passed on. #' -#' @param digits Determines the number of decimal place to display. Passed to +#' @param digits Determines the number of decimal places to display. Passed to #' the "nsmall =" parameter of the format function. #' #' Note: Changing the digits argument to format_table will change the number @@ -40,30 +40,76 @@ #' #' data(mtcars) #' -#' # Overall mean table +#' # Overall mean table with defaults #' #' mtcars %>% #' mean_table(mpg) %>% #' format_table() #' #' #> # A tibble: 1 x 2 -#' #> var mean_95 -#' #> -#' #> 1 mpg 20.09 (17.92 - 22.26) +#' #> response_var mean_95 +#' #> +#' #> 1 mpg 20.09 (17.92 - 22.26) #' -#' # Grouped means table +#' # Grouped means table with defaults #' #' mtcars %>% #' group_by(cyl) %>% #' mean_table(mpg) %>% #' format_table() #' -#' #> # A tibble: 3 x 3 -#' #> cyl var mean_95 -#' #> -#' #> 1 4 mpg 26.66 (23.63 - 29.69) -#' #> 2 6 mpg 19.74 (18.40 - 21.09) -#' #> 3 8 mpg 15.10 (13.62 - 16.58) +#' #> # A tibble: 3 x 4 +#' #> response_var group_var group_cat mean_95 +#' #> +#' #> 1 mpg cyl 4 26.66 (23.63 - 29.69) +#' #> 2 mpg cyl 6 19.74 (18.40 - 21.09) +#' #> 3 mpg cyl 8 15.10 (13.62 - 16.58) +#' +#' # One-way frequency tables with defaults +#' +#' mtcars %>% +#' group_by(cyl) %>% +#' mean_table(mpg) %>% +#' format_table() +#' #> # A tibble: 2 x 3 +#' #> var cat percent_95 +#' #> +#' #> 1 am 0 59.38 (40.94 - 75.50) +#' #> 2 am 1 40.62 (24.50 - 59.06) +#' +#' # Two-way frequency tables with defaults +#' +#' mtcars %>% +#' group_by(am, cyl) %>% +#' freq_table() %>% +#' format_table() +#' +#' #> # A tibble: 6 x 5 +#' #> row_var row_cat col_var col_cat percent_row_95 +#' #> +#' #> 1 am 0 cyl 4 15.79 (4.78 - 41.20) +#' #> 2 am 0 cyl 6 21.05 (7.58 - 46.44) +#' #> 3 am 0 cyl 8 63.16 (38.76 - 82.28) +#' #> 4 am 1 cyl 4 61.54 (32.30 - 84.29) +#' #> 5 am 1 cyl 6 23.08 (6.91 - 54.82) +#' #> 6 am 1 cyl 8 15.38 (3.43 - 48.18) +#' +#' #' # Two-way frequency tables with with stats = "n and row percent" +#' +#' mtcars %>% +#' group_by(am, cyl) %>% +#' freq_table(output = all) %>% # Don't forget output = all +#' format_table(stats = "n and row percent") +#' +#' #> # A tibble: 6 x 5 +#' #> row_var row_cat col_var col_cat n_percent_row +#' #> +#' #> 1 am 0 cyl 4 3 (15.79) +#' #> 2 am 0 cyl 6 4 (21.05) +#' #> 3 am 0 cyl 8 12 (63.16) +#' #> 4 am 1 cyl 4 8 (61.54) +#' #> 5 am 1 cyl 6 3 (23.08) +#' #> 6 am 1 cyl 8 2 (15.38) # ============================================================================= # S3 Generic function @@ -88,14 +134,16 @@ format_table.mean_table <- function(.data, digits = 2, stats = "mean and ci", .. # ------------------------------------------------------------------ # Prevents R CMD check: "no visible binding for global variable ‘.’" # ------------------------------------------------------------------ - lcl = ucl = n = var = mean_95 = n_mean = NULL + lcl = ucl = n = var = mean_95 = n_mean = response_var = NULL # Format statistics out <- .data %>% dplyr::mutate( mean = format(mean, nsmall = digits), lcl = format(lcl, nsmall = digits), + lcl = trimws(lcl), ucl = format(ucl, nsmall = digits), + ucl = trimws(ucl), mean_95 = paste0(mean, " (", lcl, " - ", ucl, ")"), n_mean = paste0(n, " (", mean, ")") ) @@ -103,11 +151,11 @@ format_table.mean_table <- function(.data, digits = 2, stats = "mean and ci", .. # Control output if (stats == "mean and ci") { out <- out %>% - dplyr::select(var, mean_95) + dplyr::select(response_var, mean_95) } else if (stats == "n and mean") { out <- out %>% - dplyr::select(var, n_mean ) + dplyr::select(response_var, n_mean ) } # Return result @@ -130,7 +178,7 @@ format_table.mean_table_grouped <- function(.data, digits = 2, stats = "mean and # ------------------------------------------------------------------ # Prevents R CMD check: "no visible binding for global variable ‘.’" # ------------------------------------------------------------------ - lcl = ucl = n = var = mean_95 = n_mean = NULL + lcl = ucl = n = var = mean_95 = n_mean = sem = NULL # Format statistics @@ -138,19 +186,23 @@ format_table.mean_table_grouped <- function(.data, digits = 2, stats = "mean and dplyr::mutate( mean = format(mean, nsmall = digits), lcl = format(lcl, nsmall = digits), + lcl = trimws(lcl), ucl = format(ucl, nsmall = digits), + ucl = trimws(ucl), mean_95 = paste0(mean, " (", lcl, " - ", ucl, ")"), n_mean = paste0(n, " (", mean, ")") ) # Control output + # Dropping everything except the variable names, categories, and + # "mean and ci" or "n and mean" if (stats == "mean and ci") { out <- out %>% - dplyr::select(1, var, mean_95) + dplyr::select(-c(n, mean, sem, lcl, ucl, min, max, n_mean)) } else if (stats == "n and mean") { out <- out %>% - dplyr::select(1, var, n_mean ) + dplyr::select(-c(n, mean, sem, lcl, ucl, min, max, mean_95)) } # Return result @@ -173,14 +225,16 @@ format_table.freq_table_one_way <- function(.data, digits = 2, stats = "percent # ------------------------------------------------------------------ # Prevents R CMD check: "no visible binding for global variable ‘.’" # ------------------------------------------------------------------ - percent = lcl = ucl = n = percent_95 = n_percent = NULL + percent = lcl = ucl = n = percent_95 = n_percent = var = NULL # Format statistics out <- .data %>% dplyr::mutate( percent = format(percent, nsmall = digits), lcl = format(lcl, nsmall = digits), + lcl = trimws(lcl), ucl = format(ucl, nsmall = digits), + ucl = trimws(ucl), percent_95 = paste0(percent, " (", lcl, " - ", ucl, ")"), n_percent = paste0(n, " (", percent, ")") ) @@ -188,11 +242,11 @@ format_table.freq_table_one_way <- function(.data, digits = 2, stats = "percent # Control output if (stats == "percent and ci") { out <- out %>% - dplyr::select(1, percent_95) + dplyr::select(var, cat, percent_95) } else if (stats == "n and percent") { out <- out %>% - dplyr::select(1, n_percent ) + dplyr::select(var, cat, n_percent ) } # Return result @@ -217,7 +271,7 @@ format_table.freq_table_two_way <- function(.data, digits = 2, stats = "row perc # ------------------------------------------------------------------ percent_row = lcl_row = ucl_row = n = percent_total = lcl_total = NULL ucl_total = percent_row_95 = n_percent_row = percent_total_95 = NULL - n_percent_total = NULL + n_percent_total = row_var = row_cat = col_var = col_cat = NULL # Figure out if .data includes overall percentages or not @@ -238,8 +292,10 @@ format_table.freq_table_two_way <- function(.data, digits = 2, stats = "row perc dplyr::mutate( percent_row = format(percent_row, nsmall = digits), lcl_row = format(lcl_row, nsmall = digits), + lcl_row = trimws(lcl_row), ucl_row = format(ucl_row, nsmall = digits), percent_row_95 = paste0(percent_row, " (", lcl_row, " - ", ucl_row, ")"), + ucl_row = trimws(ucl_row), n_percent_row = paste0(n, " (", percent_row, ")") ) @@ -250,7 +306,9 @@ format_table.freq_table_two_way <- function(.data, digits = 2, stats = "row perc dplyr::mutate( percent_total = format(percent_total, nsmall = digits), lcl_total = format(lcl_total, nsmall = digits), + lcl_total = trimws(lcl_total), ucl_total = format(ucl_total, nsmall = digits), + ucl_total = trimws(ucl_total), percent_total_95 = paste0(percent_total, " (", lcl_total, " - ", ucl_total, ")"), n_percent_total = paste0(n, " (", percent_total, ")") ) @@ -261,11 +319,11 @@ format_table.freq_table_two_way <- function(.data, digits = 2, stats = "row perc # -------------- if (stats == "row percent and ci") { out <- out %>% - dplyr::select(1:2, percent_row_95) + dplyr::select(row_var, row_cat, col_var, col_cat, percent_row_95) } else if (stats == "n and row percent") { out <- out %>% - dplyr::select(1:2, n_percent_row) + dplyr::select(row_var, row_cat, col_var, col_cat, n_percent_row) } else if ((stats == "percent and ci" || stats == "n and percent") && !has_overall_percent) { stop("In order to pass stats = 'percent and ci' or 'n and percent' to format_table ", @@ -273,11 +331,11 @@ format_table.freq_table_two_way <- function(.data, digits = 2, stats = "row perc } else if (stats == "percent and ci" && has_overall_percent) { out <- out %>% - dplyr::select(1:2, percent_total_95) + dplyr::select(row_var, row_cat, col_var, col_cat, percent_total_95) } else if (stats == "n and percent" && has_overall_percent) { out <- out %>% - dplyr::select(1:2, n_percent_total) + dplyr::select(row_var, row_cat, col_var, col_cat, n_percent_total) } # Return result diff --git a/R/freq_table.R b/R/freq_table.R index 7f2a2f6..1d43525 100644 --- a/R/freq_table.R +++ b/R/freq_table.R @@ -71,6 +71,7 @@ #' #' @examples #' library(tidyverse) +#' library(bfuncs) #' #' data(mtcars) #' @@ -80,11 +81,11 @@ #' group_by(am) %>% #' freq_table() #' -#' #> # A tibble: 2 x 6 -#' #> am n n_total percent lcl_log ucl_log -#' #> -#' #> 1 0 19 32 59.38 40.94 75.50 -#' #> 2 1 13 32 40.62 24.50 59.06 +#' #> # A tibble: 2 x 7 +#' #> var cat n n_total percent lcl ucl +#' #> +#' #> 1 am 0 19 32 59.38 40.94 75.50 +#' #> 2 am 1 13 32 40.62 24.50 59.06 #' #' # Two-way frequency table with defaults #' @@ -92,15 +93,15 @@ #' group_by(am, cyl) %>% #' freq_table() #' -#' #> # A tibble: 6 x 8 -#' #> am cyl n n_row n_total percent_row lcl_row_log ucl_row_log -#' #> -#' #> 1 0 4 3 19 32 15.79 4.78 41.20 -#' #> 2 0 6 4 19 32 21.05 7.58 46.44 -#' #> 3 0 8 12 19 32 63.16 38.76 82.28 -#' #> 4 1 4 8 13 32 61.54 32.30 84.29 -#' #> 5 1 6 3 13 32 23.08 6.91 54.82 -#' #> 6 1 8 2 13 32 15.38 3.43 48.18 +#' #> # A tibble: 6 x 10 +#' #> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row ucl_row +#' #> +#' #> 1 am 0 cyl 4 3 19 32 15.79 4.78 41.20 +#' #> 2 am 0 cyl 6 4 19 32 21.05 7.58 46.44 +#' #> 3 am 0 cyl 8 12 19 32 63.16 38.76 82.28 +#' #> 4 am 1 cyl 4 8 13 32 61.54 32.30 84.29 +#' #> 5 am 1 cyl 6 3 13 32 23.08 6.91 54.82 +#' #> 6 am 1 cyl 8 2 13 32 15.38 3.43 48.18 freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", digits = 2, ...) { @@ -112,7 +113,9 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", se_total = prop_log_total = t_crit_total = se_log_total = lcl_total_log = NULL percent_total = n_row = prop_row = se_row = prop_log_row = t_crit_row = NULL se_log_row = lcl_row_log = ucl_row_log = percent_row = lcl_row = NULL - ucl_row = lcl_total = ucl_total = ucl_total_log = NULL + ucl_row = lcl_total = ucl_total = ucl_total_log = n_groups = NULL + ci_type_arg = output_arg = `.` = var = row_var = row_cat = NULL + col_var = col_cat = NULL # =========================================================================== # Check for grouped tibble @@ -120,18 +123,33 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", if (!("grouped_df" %in% class(x))) { stop(paste("The x argument to freq_table must be a grouped tibble. The class of the current x argument is", class(x))) - } # No else + } + + # =========================================================================== + # Enquo arguments + # enquo/quo_name/UQ the ci_type and output argument so that I don't have to + # use quotation marks around the argument being passed. + # =========================================================================== + ci_type_arg <- rlang::enquo(ci_type) %>% rlang::quo_name() %>% rlang::UQ() + output_arg <- rlang::enquo(output) %>% rlang::quo_name() %>% rlang::UQ() # =========================================================================== # Check for number of group vars: # =========================================================================== - out <- x %>% - dplyr::summarise(n = n()) + n_groups <- attributes(x)$vars %>% length() # =========================================================================== # One-way tables # =========================================================================== - if (ncol(out) == 2) { # else is in two-way tables. + if (n_groups == 1) { # "else" is in two-way tables. + + # Create first three columns of summary table: grouped variable name, + # grouped variable categories, and n of each category + out <- x %>% + dplyr::summarise(n = n()) %>% + dplyr::mutate(var = !!names(.[1])) %>% + dplyr::rename(cat = !!names(.[1])) %>% + dplyr::select(var, cat, n) # Update out to include elements needed for Wald and Logit transformed CI's # One-way tables @@ -147,7 +165,7 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", # ------------------- # and put prop, se, and CI's on percent scale # One-way tables - if (ci_type == "wald") { + if (ci_type_arg == "wald") { out <- out %>% dplyr::mutate( @@ -165,20 +183,20 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", # Control output # Typically, I only want the frequency, percent and 95% CI # Make that the default - if (output == "default") { + if (output_arg == "default") { out <- out %>% - dplyr::select(1, n, n_total, percent, lcl, ucl) + dplyr::select(var, cat, n, n_total, percent, lcl, ucl) - } else if (output == "all") { + } else if (output_arg == "all") { out <- out %>% - dplyr::select(1, n, n_total, percent, se, t_crit, lcl, ucl) + dplyr::select(var, cat, n, n_total, percent, se, t_crit, lcl, ucl) } # Calculate logit transformed CI's # ------------------------------ # and put prop, se, and CI's on percent scale # One-way tables - } else if (ci_type == "logit") { + } else if (ci_type_arg == "logit") { out <- out %>% dplyr::mutate( @@ -200,13 +218,13 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", # Control output # Typically, I only want the frequency, percent and 95% CI # Make that the default - if (output == "default") { + if (output_arg == "default") { out <- out %>% - dplyr::select(1, n, n_total, percent, lcl, ucl) + dplyr::select(var, cat, n, n_total, percent, lcl, ucl) - } else if (output == "all") { + } else if (output_arg == "all") { out <- out %>% - dplyr::select(1, n, n_total, percent, se, t_crit, lcl, ucl) + dplyr::select(var, cat, n, n_total, percent, se, t_crit, lcl, ucl) } } @@ -220,9 +238,23 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", # Only logit transformed CI's # Need percent and row percent # =========================================================================== - } else if (ncol(out) == 3) { # if is one-way tables + } else if (n_groups == 2) { # "if" is one-way tables + + # Create first three columns of summary table: row variable name, + # row variable categories, column variable name, column variable categories + # and n of each category row/col combination + out <- x %>% + dplyr::summarise(n = n()) %>% + dplyr::mutate( + row_var = !!names(.[1]), + col_var = !!names(.[2]) + ) %>% + dplyr::rename( + row_cat = !!names(.[1]), + col_cat = !!names(.[2]) + ) %>% + dplyr::select(row_var, row_cat, col_var, col_cat, n) %>% - out <- out %>% # Calculate within row n dplyr::mutate(n_row = sum(n)) %>% # Ungroup to get total_n @@ -271,15 +303,17 @@ freq_table <- function(x, t_prob = 0.975, ci_type = "logit", output = "default", # Control output # Typically, I only want the frequency, row percent and 95% CI for the row percent # Make that the default - if (output == "default") { + if (output_arg == "default") { out <- out %>% - dplyr::select(1:2, n, n_row, n_total, percent_row, lcl_row, ucl_row) + dplyr::select(row_var, row_cat, col_var, col_cat, n, n_row, n_total, + percent_row, lcl_row, ucl_row) - } else if (output == "all") { + } else if (output_arg == "all") { out <- out %>% - dplyr::select(1:2, n, n_row, n_total, percent_total, se_total, t_crit_total, - lcl_total, ucl_total, percent_row, se_row, t_crit_row, - lcl_row, ucl_row) + dplyr::select(row_var, row_cat, col_var, col_cat, n, n_row, n_total, + percent_total, se_total, t_crit_total, + lcl_total, ucl_total, percent_row, se_row, t_crit_row, + lcl_row, ucl_row) } # Add freq_table class to out diff --git a/R/freq_test.R b/R/freq_test.R index 6d077fd..de537b0 100644 --- a/R/freq_test.R +++ b/R/freq_test.R @@ -33,13 +33,13 @@ #' group_by(am) %>% #' freq_table() %>% #' freq_test() %>% -#' select(1:6, p_chi2_pearson) +#' select(var:percent, p_chi2_pearson) #' -#' #> # A tibble: 2 x 7 -#' #> am n n_total percent lcl ucl p_chi2_pearson -#' #> -#' #> 1 0 19 32 59.38 40.94 75.50 0.2888444 -#' #> 2 1 13 32 40.62 24.50 59.06 0.2888444 +#' #> # A tibble: 2 x 6 +#' #> var cat n n_total percent p_chi2_pearson +#' #> +#' #> 1 am 0 19 32 59.38 0.2888444 +#' #> 2 am 1 13 32 40.62 0.2888444 #' #' # Chi-square test of independence #' @@ -47,15 +47,15 @@ #' group_by(am, vs) %>% #' freq_table() %>% #' freq_test() %>% -#' select(1:8, p_chi2_pearson) +#' select(row_var:n, percent_row, p_chi2_pearson) #' -#' #> # A tibble: 4 x 9 -#' #> am vs n n_row n_total percent_row lcl_row ucl_row p_chi2_pearson -#' #> -#' #> 1 0 0 12 19 32 63.16 38.76 82.28 0.3409429 -#' #> 2 0 1 7 19 32 36.84 17.72 61.24 0.3409429 -#' #> 3 1 0 6 13 32 46.15 20.83 73.63 0.3409429 -#' #> 4 1 1 7 13 32 53.85 26.37 79.17 0.3409429 +#' #> # A tibble: 4 x 7 +#' #> row_var row_cat col_var col_cat n percent_row p_chi2_pearson +#' #> +#' #> 1 am 0 vs 0 12 63.16 0.3409429 +#' #> 2 am 0 vs 1 7 36.84 0.3409429 +#' #> 3 am 1 vs 0 6 46.15 0.3409429 +#' #> 4 am 1 vs 1 7 53.85 0.3409429 # ============================================================================= # S3 Generic function @@ -126,7 +126,7 @@ freq_test.freq_table_two_way <- function(x, method = "pearson", ...) { # Prevents R CMD check: "no visible binding for global variable ‘.’" # ------------------------------------------------------------------ n_row = n_col = n_total = n_expected = chi2_contrib = r = pchisq = NULL - chi2_pearson = df = NULL + chi2_pearson = df = col_cat = n = row_cat = NULL # Check to make sure x is a freq_table_two_way # -------------------------------------------- @@ -134,25 +134,19 @@ freq_test.freq_table_two_way <- function(x, method = "pearson", ...) { stop("x must be of class freq_table_two_way. It is currently: ", class(x)) } - # Grab name of row variable (first column from freq_table) - # Grab name of column variable (second column from freq_table) - # ------------------------------------------------------------ - row_var <- x %>% dplyr::select(1) %>% names() %>% rlang::sym() - col_var <- x %>% dplyr::select(2) %>% names() %>% rlang::sym() - # Calculate Pearson's Chi-square test # Test whether population is equally distributed across categories of x # --------------------------------------------------------------------- out <- x %>% - dplyr::group_by(!! col_var) %>% + dplyr::group_by(col_cat) %>% dplyr::mutate(n_col = sum(n)) %>% # Find marginal totals for "columns" dplyr::ungroup() %>% dplyr::mutate( n_expected = (n_row * n_col) / n_total, chi2_contrib = (n - n_expected)**2 / n_expected, chi2_pearson = sum(chi2_contrib), - r = unique(!! row_var) %>% length(), - c = unique(!! col_var) %>% length(), + r = unique(row_cat) %>% length(), + c = unique(col_cat) %>% length(), df = (r -1) * (c - 1), p_chi2_pearson = pchisq(chi2_pearson, df, lower.tail = FALSE) ) @@ -168,8 +162,8 @@ freq_test.freq_table_two_way <- function(x, method = "pearson", ...) { if ("fisher" %in% method) { # Convert x to a matrix - n <- x[, 3] %>% unlist() - mx <- matrix(n, nrow = 2, byrow = TRUE) + n_s <- dplyr::pull(x, n) + mx <- matrix(n_s, nrow = 2, byrow = TRUE) # Use R's built-in fisher.test fisher <- stats::fisher.test(mx) diff --git a/R/mean_table.R b/R/mean_table.R index d8d0917..51680ce 100644 --- a/R/mean_table.R +++ b/R/mean_table.R @@ -18,10 +18,9 @@ #' #' @param output Options for this parameter are "default" and "all". #' -#' Default output includes the n, mean, and 95% confidence interval for the -#' mean. Using output = "all" also returns the standard error of the number -#' of missing values for x, the critical t-value, and the standard error of -#' the mean. +#' Default output includes the n, mean, sem, and 95% confidence interval for +#' the mean. Using output = "all" also returns the the number of missing +#' values for x and the critical t-value. #' #' @param digits Round mean, lcl, and ucl to digits. Default is 2. #' @@ -44,10 +43,10 @@ #' mtcars %>% #' mean_table(mpg) #' -#' #> # A tibble: 1 x 5 -#' #> var n mean lcl ucl min max -#' #> -#' #> 1 mpg 32 20.09 17.92 22.26 10.4 33.9 +#' #> # A tibble: 1 x 8 +#' #> response_var n mean sem lcl ucl min max +#' #> +#' #> 1 mpg 32 20.09 1.065424 17.92 22.26 10.4 33.9 #' #' # Grouped means table with defaults #' @@ -55,92 +54,153 @@ #' group_by(cyl) %>% #' mean_table(mpg) #' -#' #> # A tibble: 3 x 6 -#' #> cyl var n mean lcl ucl min max -#' #> -#' #> 1 4 mpg 11 26.66 23.63 29.69 21.4 33.9 -#' #> 2 6 mpg 7 19.74 18.40 21.09 17.8 21.4 -#' #> 3 8 mpg 14 15.10 13.62 16.58 10.4 19.2 +#' #> # A tibble: 3 x 10 +#' #> response_var group_var group_cat n mean sem lcl ucl min max +#' #> +#' #> 1 mpg cyl 4 11 26.66 1.3597642 23.63 29.69 21.4 33.9 +#' #> 2 mpg cyl 6 7 19.74 0.5493967 18.40 21.09 17.8 21.4 +#' #> 3 mpg cyl 8 14 15.10 0.6842016 13.62 16.58 10.4 19.2 -mean_table <- function(.data, x, t_prob = 0.975, output = "default", digits = 2, ...) { +mean_table <- function(.data, x, t_prob = 0.975, output = default, digits = 2, ...) { # ------------------------------------------------------------------ # Prevents R CMD check: "no visible binding for global variable ‘.’" # ------------------------------------------------------------------ - n = t_crit = sem = lcl = ucl = var = NULL - - - # =========================================================================== - # Enquo the x argument so that it can be used in the dplyr pipeline below. - # =========================================================================== - response_var <- rlang::enquo(x) - + n = t_crit = sem = lcl = ucl = var = n_groups = group_1 = NULL + group_2 = output_arg = default = response_var = `.` = NULL + group_var = group_cat = group_1_cat = group_2_cat = n_miss = NULL # =========================================================================== # Quick data checks + # Input data frame is class data.frame + # The "x" argument is a numeric vector + # There are zero, one, or two group_by variables # =========================================================================== if (!("data.frame" %in% class(.data))) { message("Expecting the class of .data to include data.frame. Instead, the ", - "class was ", class(.data)) + "class was ", class(.data)) } if (missing(x)) { stop("No argument was passed to the 'x' parameter. Expecting 'x' to be a ", "numeric column.") + } + + # =========================================================================== + # Enquo arguments + # enquo the x argument so that it can be used in the dplyr pipeline below. + # The x argument is the variable you want the mean of. + # enquo/quo_name/UQ the output argument so that I don't have to use + # quotation marks around the argument being passed. + # =========================================================================== + x <- rlang::enquo(x) + output_arg <- rlang::enquo(output) %>% rlang::quo_name() %>% rlang::UQ() + + # =========================================================================== + # Grouping variables + # Count the number of them - accept zero, one, or two + # Grab their names - later returned in summary table + # =========================================================================== + if ("grouped_df" %in% class(.data)) { + n_groups <- attributes(.data)$vars %>% length() + } else { + n_groups <- 0L + } + + if (n_groups == 1) { + group_1 <- attributes(.data)$vars + + } else if (n_groups == 2) { + group_1 <- attributes(.data)$vars[1] + group_2 <- attributes(.data)$vars[2] + } else if (n_groups > 2) { + stop(".data can be grouped by up to two variables. It is currently grouped ", + n_groups, " variables") } # =========================================================================== - # One-way table of means and related stats - # Just "works" if grouped_df too + # First, create a general summary table of means and related stats. Then, add + # group variable names to summary table where applicable + # 1. No group_by variables + # 2. One group_by variable + # 3. Two group_by variables # =========================================================================== out <- .data %>% - dplyr::filter(!is.na(!!response_var)) %>% # Drop missing + # Drop missing + dplyr::filter(!is.na(!! x)) %>% dplyr::summarise( - var = rlang::quo_name(response_var), # Grab variable (x) name - n_miss = is.na(.data[[rlang::quo_name(response_var)]]) %>% sum, # Count missing from before drop - n = n(), - mean = mean(!!response_var), - t_crit = stats::qt(t_prob, n - 1), - sem = stats::sd(!!response_var) / sqrt(n), - lcl = mean - t_crit * sem, - ucl = mean + t_crit * sem, - mean = round(mean, digits), # Round mean - lcl = round(lcl, digits), # Round lcl - ucl = round(ucl, digits), # Round ucl - min = min(!!response_var), - max = max(!!response_var) + # Grab variable (x) name + response_var = rlang::quo_name(x), + # Count missing from before drop + n_miss = is.na(.data[[rlang::quo_name(x)]]) %>% sum(), + n = n(), + mean = mean(!! x), + t_crit = stats::qt(t_prob, n - 1), + sem = stats::sd(!! x) / sqrt(n), + lcl = mean - t_crit * sem, + ucl = mean + t_crit * sem, + mean = round(mean, digits), # Round mean + lcl = round(lcl, digits), # Round lcl + ucl = round(ucl, digits), # Round ucl + min = min(!! x), + max = max(!! x) ) %>% tibble::as.tibble() # =========================================================================== - # Classes of output + # Add group variable names to summary table - if applicable + # Then move to the front of the summary table. + + # Also add classes to "out" # If the input data frame (.data) was a grouped data frame, then the output # will be a bivariate analysis of means ("mean_table_grouped"). Pass that - # information on to out. It can be used later in format_table. + # information on to "out." It can be used later in format_table. # Otherwise the output will be a univariate analysis of means ("mean_table") # That class will also be used later in format_table. # =========================================================================== - if ("grouped_df" %in% class(.data)) { - class(out) <- c("mean_table_grouped", class(out)) - } else { + if (n_groups == 0) { + + out <- out %>% + dplyr::select(response_var, dplyr::everything()) class(out) <- c("mean_table", class(out)) + } + else if (n_groups == 1) { + + out <- out %>% + dplyr::mutate(group_var = group_1) %>% + dplyr::rename(group_cat = !! names(.)[1]) %>% + dplyr::select(response_var, group_var, group_cat, dplyr::everything()) + class(out) <- c("mean_table_grouped", class(out)) + + } else if (n_groups == 2) { - # Control output - # Typically, I only want the frequency, mean, and 95% CI - # Make that the default - if (output == "default" && class(out) == "mean_table") { out <- out %>% - dplyr::select(var, n, mean, lcl, ucl, min, max) + dplyr::mutate( + group_1 = group_1, + group_2 = group_2 + ) %>% + dplyr::rename( + group_1_cat = !! names(.)[1], + group_2_cat = !! names(.)[2] + ) %>% + dplyr::select(response_var, group_1, group_1_cat, group_2, group_2_cat, + dplyr::everything()) %>% + dplyr::ungroup() + class(out) <- c("mean_table_grouped", class(out)) + } - } else if (output == "default" && class(out) == "mean_table_grouped") { + # =========================================================================== + # Control output: + # Typically, I only want the frequency, mean, 95% CI, sem, min, and max. + # Make that the default. + # =========================================================================== + if (output_arg == "default") { out <- out %>% - dplyr::select(1, var, n, mean, lcl, ucl, min, max) - } else { - out <- out # do nothing + dplyr::select(-c(n_miss, t_crit)) } - # Return tibble of results + # Return summary table out } diff --git a/inst/doc/descriptive_analysis.html b/inst/doc/descriptive_analysis.html index cf6537b..2d5db12 100644 --- a/inst/doc/descriptive_analysis.html +++ b/inst/doc/descriptive_analysis.html @@ -69,7 +69,7 @@

Descriptive Analysis in a dplyr Pipeline

Brad Cannell

-

Created: 2017-11-10
Updated: 2018-01-05

+

Created: 2017-11-10
Updated: 2018-01-07

@@ -96,21 +96,8 @@

Created: 2017-11-10
Updated: 2018-01-05

Interpretation of confidence intervals


I initially created this file to help develop the freq_table function in bfuncs. I’ve since adapted it to encompass general descriptive data analysis using R in a dplyr pipeline. Herein, descriptive analysis refers to basic univariate or bivariate statistics calculated for continuous and categorical variables (e.g., means and percentages). This vignette is not intended to be representative of every possible descriptive analysis that one may want to carry out on a given data set. Rather, it is intended to be representative of the descriptive analyses I most commonly need while conducting epidemiologic research.

-
library(tidyverse)
-
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
-
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
-## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
-## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
-## ✔ readr   1.1.1     ✔ forcats 0.2.0
-
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-
library(bfuncs)
-
## 
-## Attaching package: 'bfuncs'
-
## The following object is masked _by_ '.GlobalEnv':
-## 
-##     about_data
+
library(tidyverse)
+library(bfuncs)
data(mtcars)

@@ -130,10 +117,10 @@

Univariate means and 95% confidence intervals

This matches the method used by SAS: http://support.sas.com/documentation/cdl/en/proc/65145/HTML/default/viewer.htm#p0klmrp4k89pz0n1p72t0clpavyx.htm

mtcars %>% 
   mean_table(mpg)
-
## # A tibble: 1 x 7
-##     var     n  mean   lcl   ucl   min   max
-##   <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1   mpg    32 20.09 17.92 22.26  10.4  33.9
+
## # A tibble: 1 x 8
+##   response_var     n  mean      sem   lcl   ucl   min   max
+##          <chr> <int> <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>
+## 1          mpg    32 20.09 1.065424 17.92 22.26  10.4  33.9

By adjusting the t_prob parameter, it is possible to change the width of the confidence intervals. The example below returns a 99% confidence interval.

The value for t_prob is calculated as 1 - alpha / 2.

alpha <- 1 - .99
@@ -141,19 +128,19 @@ 

Univariate means and 95% confidence intervals

mtcars %>% mean_table(mpg, t_prob = t)
-
## # A tibble: 1 x 7
-##     var     n  mean   lcl   ucl   min   max
-##   <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1   mpg    32 20.09 17.17 23.01  10.4  33.9
+
## # A tibble: 1 x 8
+##   response_var     n  mean      sem   lcl   ucl   min   max
+##          <chr> <int> <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>
+## 1          mpg    32 20.09 1.065424 17.17 23.01  10.4  33.9

With the output = "all" option, mean_table also returns the number of missing values, the critical value from student’s t distribution with degrees of freedom n - 1, and the standard error of the mean.

We can also control the precision of the statistics using the digits parameter.

mtcars %>% 
   mean_table(mpg, output = "all", digits = 5)
## # A tibble: 1 x 10
-##     var n_miss     n     mean   t_crit      sem      lcl      ucl   min
-##   <chr>  <int> <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl> <dbl>
-## 1   mpg      0    32 20.09062 2.039513 1.065424 17.91768 22.26357  10.4
-## # ... with 1 more variables: max <dbl>
+## response_var n_miss n mean t_crit sem lcl ucl +## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> +## 1 mpg 0 32 20.09062 2.039513 1.065424 17.91768 22.26357 +## # ... with 2 more variables: min <dbl>, max <dbl>

This output matches the results obtained from SAS proc means and the Stata mean command (shown below).

@@ -176,13 +163,14 @@

Bivariate means and 95% confidence intervals

mtcars %>% 
   group_by(cyl) %>% 
   mean_table(mpg, output = "all", digits = 5)
-
## # A tibble: 3 x 11
-##     cyl   var n_miss     n     mean   t_crit       sem      lcl      ucl
-##   <dbl> <chr>  <int> <int>    <dbl>    <dbl>     <dbl>    <dbl>    <dbl>
-## 1     4   mpg      0    11 26.66364 2.228139 1.3597642 23.63389 29.69338
-## 2     6   mpg      0     7 19.74286 2.446912 0.5493967 18.39853 21.08718
-## 3     8   mpg      0    14 15.10000 2.160369 0.6842016 13.62187 16.57813
-## # ... with 2 more variables: min <dbl>, max <dbl>
+
## # A tibble: 3 x 12
+##   response_var group_var group_cat n_miss     n     mean   t_crit
+##          <chr>     <chr>     <dbl>  <int> <int>    <dbl>    <dbl>
+## 1          mpg       cyl         4      0    11 26.66364 2.228139
+## 2          mpg       cyl         6      0     7 19.74286 2.446912
+## 3          mpg       cyl         8      0    14 15.10000 2.160369
+## # ... with 5 more variables: sem <dbl>, lcl <dbl>, ucl <dbl>, min <dbl>,
+## #   max <dbl>

For comparison, here is the output from SAS proc means and the Stata mean command.

@@ -206,11 +194,11 @@

Logit transformed confidence intervals

mtcars %>% 
   group_by(am) %>% 
   freq_table(output = "all", digits = 5)
-
## # A tibble: 2 x 8
-##      am     n n_total percent       se   t_crit      lcl      ucl
-##   <dbl> <int>   <int>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
-## 1     0    19      32  59.375 8.820997 2.039513 40.94225 75.49765
-## 2     1    13      32  40.625 8.820997 2.039513 24.50235 59.05775
+
## # A tibble: 2 x 9
+##     var   cat     n n_total percent       se   t_crit      lcl      ucl
+##   <chr> <dbl> <int>   <int>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
+## 1    am     0    19      32  59.375 8.820997 2.039513 40.94225 75.49765
+## 2    am     1    13      32  40.625 8.820997 2.039513 24.50235 59.05775

 

@@ -227,11 +215,11 @@

Wald confidence intervals

mtcars %>% 
   group_by(am) %>% 
   freq_table(ci_type = "wald", output = "all", digits = 5)
-
## # A tibble: 2 x 8
-##      am     n n_total percent       se   t_crit      lcl      ucl
-##   <dbl> <int>   <int>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
-## 1     0    19      32  59.375 8.820997 2.039513 41.38446 77.36554
-## 2     1    13      32  40.625 8.820997 2.039513 22.63446 58.61554
+
## # A tibble: 2 x 9
+##     var   cat     n n_total percent       se   t_crit      lcl      ucl
+##   <chr> <dbl> <int>   <int>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
+## 1    am     0    19      32  59.375 8.820997 2.039513 41.38446 77.36554
+## 2    am     1    13      32  40.625 8.820997 2.039513 22.63446 58.61554

 

@@ -249,18 +237,18 @@

Bivariate percentages and 95% log transformed confidence intervals

mtcars %>% 
   group_by(am, cyl) %>% 
   freq_table(output = "all", digits = 5)
-
## # A tibble: 6 x 15
-##      am   cyl     n n_row n_total percent_total se_total t_crit_total
-##   <dbl> <dbl> <int> <int>   <int>         <dbl>    <dbl>        <dbl>
-## 1     0     4     3    19      32         9.375 5.235146     2.039513
-## 2     0     6     4    19      32        12.500 5.939887     2.039513
-## 3     0     8    12    19      32        37.500 8.695104     2.039513
-## 4     1     4     8    13      32        25.000 7.777138     2.039513
-## 5     1     6     3    13      32         9.375 5.235146     2.039513
-## 6     1     8     2    13      32         6.250 4.347552     2.039513
-## # ... with 7 more variables: lcl_total <dbl>, ucl_total <dbl>,
-## #   percent_row <dbl>, se_row <dbl>, t_crit_row <dbl>, lcl_row <dbl>,
-## #   ucl_row <dbl>
+
## # A tibble: 6 x 17
+##   row_var row_cat col_var col_cat     n n_row n_total percent_total
+##     <chr>   <dbl>   <chr>   <dbl> <int> <int>   <int>         <dbl>
+## 1      am       0     cyl       4     3    19      32         9.375
+## 2      am       0     cyl       6     4    19      32        12.500
+## 3      am       0     cyl       8    12    19      32        37.500
+## 4      am       1     cyl       4     8    13      32        25.000
+## 5      am       1     cyl       6     3    13      32         9.375
+## 6      am       1     cyl       8     2    13      32         6.250
+## # ... with 9 more variables: se_total <dbl>, t_crit_total <dbl>,
+## #   lcl_total <dbl>, ucl_total <dbl>, percent_row <dbl>, se_row <dbl>,
+## #   t_crit_row <dbl>, lcl_row <dbl>, ucl_row <dbl>
diff --git a/inst/doc/freq_test.html b/inst/doc/freq_test.html index dea8745..0e2761b 100644 --- a/inst/doc/freq_test.html +++ b/inst/doc/freq_test.html @@ -73,7 +73,7 @@

Using the freq_test Function

Brad Cannell

-

Created: 2017-12-06
Updated: 2018-01-05

+

Created: 2017-12-06
Updated: 2018-01-07

@@ -99,8 +99,8 @@

Created: 2017-12-06
Updated: 2018-01-05

Because we want different behavior for different classes of inputs, freq_test is an S3 generic function with multiple methods.

The name freq_test is easy to reason about, sounds similar to freq_table, which must be the input to freq_test, and is general enough to contain chi-square, fisher, etc.

The flowchart below illustrates the scenarios, and related hypothesis tests, discussed in this vignette.

-
- +
+

Table of Contents

    @@ -133,11 +133,11 @@

    Two categories

    students %>% 
       group_by(male) %>% 
       freq_table()
    -#> # A tibble: 2 x 6
    -#>    male     n n_total percent   lcl   ucl
    -#>   <int> <int>   <int>   <dbl> <dbl> <dbl>
    -#> 1     0     9      20      45 23.76 68.23
    -#> 2     1    11      20      55 31.77 76.24
    +#> # A tibble: 2 x 7 +#> var cat n n_total percent lcl ucl +#> <chr> <int> <int> <int> <dbl> <dbl> <dbl> +#> 1 male 0 9 20 45 23.76 68.23 +#> 2 male 1 11 20 55 31.77 76.24

The frequencies returned to us by freq_table tell us that our sample includes 9 females (which is 45% of all students in the sample) and 11 males (which is 55% of all students sampled). But, what about the precision of those estimates?

If we are only interested in the proportion of students in our sample that are male and female, then we’re done. We have our answer. Typically, though, our interest extends beyond our little sample. What we really want to know is if the proportion of males and females differ among all students on our hypothetical campus (our superpopulation).

I like the following quote from Daniel Kaplan that discusses sampling variability and its relationship to precision of our estimate:

@@ -196,11 +196,11 @@

Test for a difference in proportions

group_by(male) %>% freq_table() %>% freq_test() -#> # A tibble: 2 x 11 -#> male n n_total percent lcl ucl n_expected chi2_contrib -#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 0 9 20 45 23.76 68.23 10 0.1 -#> 2 1 11 20 55 31.77 76.24 10 0.1 +#> # A tibble: 2 x 12 +#> var cat n n_total percent lcl ucl n_expected chi2_contrib +#> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 male 0 9 20 45 23.76 68.23 10 0.1 +#> 2 male 1 11 20 55 31.77 76.24 10 0.1 #> # ... with 3 more variables: chi2_pearson <dbl>, df <dbl>, #> # p_chi2_pearson <dbl>
@@ -224,10 +224,10 @@

Comparison to other statistical software

freq_test() %>% select(1:6, p_chi2_pearson) #> # A tibble: 2 x 7 -#> am n n_total percent lcl ucl p_chi2_pearson -#> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> -#> 1 0 19 32 59.38 40.94 75.50 0.2888444 -#> 2 1 13 32 40.62 24.50 59.06 0.2888444
+#> var cat n n_total percent lcl p_chi2_pearson +#> <chr> <dbl> <int> <int> <dbl> <dbl> <dbl> +#> 1 am 0 19 32 59.38 40.94 0.2888444 +#> 2 am 1 13 32 40.62 24.50 0.2888444

 

These results match the results obtained from Stata and SAS (see below).

 

@@ -274,12 +274,12 @@

More than two categories

group_by(cyl) %>% freq_table() %>% freq_test() -#> # A tibble: 3 x 11 -#> cyl n n_total percent lcl ucl n_expected chi2_contrib -#> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 4 11 32 34.38 19.50 53.11 10.66667 0.01041667 -#> 2 6 7 32 21.88 10.35 40.45 10.66667 1.26041667 -#> 3 8 14 32 43.75 27.10 61.94 10.66667 1.04166667 +#> # A tibble: 3 x 12 +#> var cat n n_total percent lcl ucl n_expected chi2_contrib +#> <chr> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 cyl 4 11 32 34.38 19.50 53.11 10.66667 0.01041667 +#> 2 cyl 6 7 32 21.88 10.35 40.45 10.66667 1.26041667 +#> 3 cyl 8 14 32 43.75 27.10 61.94 10.66667 1.04166667 #> # ... with 3 more variables: chi2_pearson <dbl>, df <dbl>, #> # p_chi2_pearson <dbl>

 

@@ -317,31 +317,32 @@

Two categories

freq_test() #> Warning in freq_test.freq_table_two_way(.): One or more expected cell #> counts are <= 5. Pearson's Chi-square may not be a valid test. -#> # A tibble: 4 x 16 -#> male binge n n_row n_total percent_row lcl_row ucl_row n_col -#> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> -#> 1 0 0 8 11 20 72.73 37.63 92.18 15 -#> 2 0 1 3 11 20 27.27 7.82 62.37 5 -#> 3 1 0 7 9 20 77.78 37.12 95.40 15 -#> 4 1 1 2 9 20 22.22 4.60 62.88 5 -#> # ... with 7 more variables: n_expected <dbl>, chi2_contrib <dbl>, -#> # chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, p_chi2_pearson <dbl>
+#> # A tibble: 4 x 18 +#> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row +#> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <dbl> +#> 1 male 0 binge 0 8 11 20 72.73 37.63 +#> 2 male 0 binge 1 3 11 20 27.27 7.82 +#> 3 male 1 binge 0 7 9 20 77.78 37.12 +#> 4 male 1 binge 1 2 9 20 22.22 4.60 +#> # ... with 9 more variables: ucl_row <dbl>, n_col <int>, n_expected <dbl>, +#> # chi2_contrib <dbl>, chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, +#> # p_chi2_pearson <dbl>

There are a couple things worth mentioning here. First, 27% of the females in our sample engaged in binge drinking, compared to 22% of males – even though we know that the underlying probability in our superpopulation was 10% for females and 30% for males. This discrepancy highlights the potential for sampling variability, and small sample estimates, to be misleading.

Additionally, because some expected cell counts were less than 5, we should estimate our p-value using Fisher’s exact method.

students %>% 
   group_by(male, binge) %>% 
   freq_table() %>% 
   freq_test(method = "fisher")
-#> # A tibble: 4 x 17
-#>    male binge     n n_row n_total percent_row lcl_row ucl_row n_col
-#>   <int> <int> <int> <int>   <int>       <dbl>   <dbl>   <dbl> <int>
-#> 1     0     0     8    11      20       72.73   37.63   92.18    15
-#> 2     0     1     3    11      20       27.27    7.82   62.37     5
-#> 3     1     0     7     9      20       77.78   37.12   95.40    15
-#> 4     1     1     2     9      20       22.22    4.60   62.88     5
-#> # ... with 8 more variables: n_expected <dbl>, chi2_contrib <dbl>,
-#> #   chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, p_chi2_pearson <dbl>,
-#> #   p_fisher <dbl>
+#> # A tibble: 4 x 19 +#> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row +#> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <dbl> +#> 1 male 0 binge 0 8 11 20 72.73 37.63 +#> 2 male 0 binge 1 3 11 20 27.27 7.82 +#> 3 male 1 binge 0 7 9 20 77.78 37.12 +#> 4 male 1 binge 1 2 9 20 22.22 4.60 +#> # ... with 10 more variables: ucl_row <dbl>, n_col <int>, +#> # n_expected <dbl>, chi2_contrib <dbl>, chi2_pearson <dbl>, r <int>, +#> # c <int>, df <dbl>, p_chi2_pearson <dbl>, p_fisher <dbl>

Interpretation

The large p-value (p = 1.0) calculated using Fisher’s exact method indicates that the data are highly consistent with the null hypothesis of statistical independence between gender and binge drinking at our hypothetical campus, assuming that the null hypothesis is true and the study is free of bias. However, wide range of the 95% confidence intervals makes the precision of the point estimates doubtful. Another study using larger sample size is needed to get a more precise estimate of the relationship between gender and binge drinking.

@@ -365,12 +366,12 @@

Comparison to other statistical software

freq_test() %>% select(1:8, p_chi2_pearson) #> # A tibble: 4 x 9 -#> am vs n n_row n_total percent_row lcl_row ucl_row -#> <dbl> <dbl> <int> <int> <int> <dbl> <dbl> <dbl> -#> 1 0 0 12 19 32 63.16 38.76 82.28 -#> 2 0 1 7 19 32 36.84 17.72 61.24 -#> 3 1 0 6 13 32 46.15 20.83 73.63 -#> 4 1 1 7 13 32 53.85 26.37 79.17 +#> row_var row_cat col_var col_cat n n_row n_total percent_row +#> <chr> <dbl> <chr> <dbl> <int> <int> <int> <dbl> +#> 1 am 0 vs 0 12 19 32 63.16 +#> 2 am 0 vs 1 7 19 32 36.84 +#> 3 am 1 vs 0 6 13 32 46.15 +#> 4 am 1 vs 1 7 13 32 53.85 #> # ... with 1 more variables: p_chi2_pearson <dbl>

 

@@ -404,17 +405,18 @@

More than two categories

freq_test() #> Warning in freq_test.freq_table_two_way(.): One or more expected cell #> counts are <= 5. Pearson's Chi-square may not be a valid test. -#> # A tibble: 6 x 16 -#> am cyl n n_row n_total percent_row lcl_row ucl_row n_col -#> <dbl> <dbl> <int> <int> <int> <dbl> <dbl> <dbl> <int> -#> 1 0 4 3 19 32 15.79 4.78 41.20 11 -#> 2 0 6 4 19 32 21.05 7.58 46.44 7 -#> 3 0 8 12 19 32 63.16 38.76 82.28 14 -#> 4 1 4 8 13 32 61.54 32.30 84.29 11 -#> 5 1 6 3 13 32 23.08 6.91 54.82 7 -#> 6 1 8 2 13 32 15.38 3.43 48.18 14 -#> # ... with 7 more variables: n_expected <dbl>, chi2_contrib <dbl>, -#> # chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, p_chi2_pearson <dbl>
+#> # A tibble: 6 x 18 +#> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row +#> <chr> <dbl> <chr> <dbl> <int> <int> <int> <dbl> <dbl> +#> 1 am 0 cyl 4 3 19 32 15.79 4.78 +#> 2 am 0 cyl 6 4 19 32 21.05 7.58 +#> 3 am 0 cyl 8 12 19 32 63.16 38.76 +#> 4 am 1 cyl 4 8 13 32 61.54 32.30 +#> 5 am 1 cyl 6 3 13 32 23.08 6.91 +#> 6 am 1 cyl 8 2 13 32 15.38 3.43 +#> # ... with 9 more variables: ucl_row <dbl>, n_col <int>, n_expected <dbl>, +#> # chi2_contrib <dbl>, chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, +#> # p_chi2_pearson <dbl>

 

There is one problem with the chi-square test, which is that the sampling distribution of the test statistic has an approximate chi-square distribution. The larger the sample is, the better this approximation becomes, and in large samples the approximation is good enough to not worry about the fact that it is an approximation. However, in small samples the approximation is not good enough, making significance tests of the chi-square distribution inaccurate.

@@ -427,18 +429,18 @@

More than two categories

#> Warning in freq_test.freq_table_two_way(., method = "fisher"): One or more #> expected cell counts are <= 5. Pearson's Chi-square may not be a valid #> test. -#> # A tibble: 6 x 17 -#> am cyl n n_row n_total percent_row lcl_row ucl_row n_col -#> <dbl> <dbl> <int> <int> <int> <dbl> <dbl> <dbl> <int> -#> 1 0 4 3 19 32 15.79 4.78 41.20 11 -#> 2 0 6 4 19 32 21.05 7.58 46.44 7 -#> 3 0 8 12 19 32 63.16 38.76 82.28 14 -#> 4 1 4 8 13 32 61.54 32.30 84.29 11 -#> 5 1 6 3 13 32 23.08 6.91 54.82 7 -#> 6 1 8 2 13 32 15.38 3.43 48.18 14 -#> # ... with 8 more variables: n_expected <dbl>, chi2_contrib <dbl>, -#> # chi2_pearson <dbl>, r <int>, c <int>, df <dbl>, p_chi2_pearson <dbl>, -#> # p_fisher <dbl> +#> # A tibble: 6 x 19 +#> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row +#> <chr> <dbl> <chr> <dbl> <int> <int> <int> <dbl> <dbl> +#> 1 am 0 cyl 4 3 19 32 15.79 4.78 +#> 2 am 0 cyl 6 4 19 32 21.05 7.58 +#> 3 am 0 cyl 8 12 19 32 63.16 38.76 +#> 4 am 1 cyl 4 8 13 32 61.54 32.30 +#> 5 am 1 cyl 6 3 13 32 23.08 6.91 +#> 6 am 1 cyl 8 2 13 32 15.38 3.43 +#> # ... with 10 more variables: ucl_row <dbl>, n_col <int>, +#> # n_expected <dbl>, chi2_contrib <dbl>, chi2_pearson <dbl>, r <int>, +#> # c <int>, df <dbl>, p_chi2_pearson <dbl>, p_fisher <dbl>

Interpret

Compare to Stata and SAS

@@ -485,12 +487,12 @@

Calculate Pearson’s Chi-square

freq_test() %>% select(1:4, n_col, n_total, n_expected:chi2_pearson) #> # A tibble: 4 x 9 -#> am vs n n_row n_col n_total n_expected chi2_contrib -#> <dbl> <dbl> <int> <int> <int> <int> <dbl> <dbl> -#> 1 0 0 12 19 18 32 10.6875 0.1611842 -#> 2 0 1 7 19 14 32 8.3125 0.2072368 -#> 3 1 0 6 13 18 32 7.3125 0.2355769 -#> 4 1 1 7 13 14 32 5.6875 0.3028846 +#> row_var row_cat col_var col_cat n_col n_total n_expected chi2_contrib +#> <chr> <dbl> <chr> <dbl> <int> <int> <dbl> <dbl> +#> 1 am 0 vs 0 18 32 10.6875 0.1611842 +#> 2 am 0 vs 1 14 32 8.3125 0.2072368 +#> 3 am 1 vs 0 18 32 7.3125 0.2355769 +#> 4 am 1 vs 1 14 32 5.6875 0.3028846 #> # ... with 1 more variables: chi2_pearson <dbl>

After obtaining the Pearson chi-squared value (0.9068826 above), we compare it to the critical value for the chi-square distribution with degrees of freedom = (rows - 1)(columns - 1) = 1. If the observed value is bigger than the critical value, we conclude that there is a statistically significant relationship.

For example:

@@ -500,12 +502,12 @@

Calculate Pearson’s Chi-square

freq_test() %>% select(1:3, chi2_pearson:p_chi2_pearson) #> # A tibble: 4 x 8 -#> am vs n chi2_pearson r c df p_chi2_pearson -#> <dbl> <dbl> <int> <dbl> <int> <int> <dbl> <dbl> -#> 1 0 0 12 0.9068826 2 2 1 0.3409429 -#> 2 0 1 7 0.9068826 2 2 1 0.3409429 -#> 3 1 0 6 0.9068826 2 2 1 0.3409429 -#> 4 1 1 7 0.9068826 2 2 1 0.3409429 +#> row_var row_cat col_var chi2_pearson r c df p_chi2_pearson +#> <chr> <dbl> <chr> <dbl> <int> <int> <dbl> <dbl> +#> 1 am 0 vs 0.9068826 2 2 1 0.3409429 +#> 2 am 0 vs 0.9068826 2 2 1 0.3409429 +#> 3 am 1 vs 0.9068826 2 2 1 0.3409429 +#> 4 am 1 vs 0.9068826 2 2 1 0.3409429

And this is the same result we get from R’s built-in chi-square test.

chisq.test(mtcars$am, mtcars$vs, correct = FALSE)
 #> 
diff --git a/inst/doc/presentation_dissemination.R b/inst/doc/presentation_dissemination.R
index 3969953..7a17823 100644
--- a/inst/doc/presentation_dissemination.R
+++ b/inst/doc/presentation_dissemination.R
@@ -64,25 +64,26 @@ mtcars %>%
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95)
+  spread(key = row_cat, value = percent_row_95)
 
 ## ------------------------------------------------------------------------
 mtcars %>% 
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95) %>% 
-  mutate(variable = colnames(.)[1]) %>% 
-  rename("class" = cyl, "am_0" = `0`, "am_1" = `1`)
+  spread(key = row_cat, value = percent_row_95) %>% 
+  select(-row_var) %>% 
+  # Rename to row bind with table shell
+  rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) 
 
 ## ------------------------------------------------------------------------
 row <- mtcars %>% 
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95) %>% 
-  mutate(variable = colnames(.)[1]) %>% 
-  rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) %>% 
+  spread(key = row_cat, value = percent_row_95) %>% 
+  select(-row_var) %>% 
+  rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
   mutate(class = as.character(class)) # Need for bind_rows below
 
 bind_rows(table, row)
@@ -99,10 +100,10 @@ for (i in seq_along(cat_vars)) {
     group_by(am, !!cat_vars[[i]]) %>% 
     freq_table() %>% 
     format_table() %>% 
-    spread(key = am, value = percent_row_95) %>% 
-    mutate(variable = colnames(.)[1]) %>% 
-    rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% 
-    mutate(class = as.character(class)) # Need for bind_rows below
+    spread(key = row_cat, value = percent_row_95) %>% 
+    select(-row_var) %>% 
+    rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
+    mutate(class = as.character(class))
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -136,8 +137,9 @@ for (i in seq_along(cont_vars)) {
     group_by(am) %>% 
     bfuncs::mean_table(!!cont_vars[[i]]) %>% 
     bfuncs::format_table() %>% 
-    spread(key = am, value = mean_95) %>% 
-    rename("variable" = var, "am_0" = `0`, "am_1" = `1`)
+    spread(key = group_cat, value = mean_95) %>% 
+    select(-group_var) %>% 
+    rename("variable" = response_var, "am_0" = `0`, "am_1" = `1`)
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -153,10 +155,10 @@ for (i in seq_along(cat_vars)) {
     group_by(am, !!cat_vars[[i]]) %>% 
     freq_table() %>% 
     format_table() %>% 
-    spread(key = am, value = percent_row_95) %>% 
-    mutate(variable = colnames(.)[1]) %>% 
-    rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% 
-    mutate(class = as.character(class)) # Need for bind_rows below
+    spread(key = row_cat, value = percent_row_95) %>% 
+    select(-row_var) %>% 
+    rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
+    mutate(class = as.character(class))
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -190,7 +192,7 @@ table <- table %>%
   print
 
 ## ------------------------------------------------------------------------
-# table %>% 
+# table %>%
 #   mutate(
 #     class = stringr::str_replace(class, "^", "---"),
 #     variable = if_else(variable == "", class, variable),
diff --git a/inst/doc/presentation_dissemination.Rmd b/inst/doc/presentation_dissemination.Rmd
index 39de6c0..9b95a3a 100644
--- a/inst/doc/presentation_dissemination.Rmd
+++ b/inst/doc/presentation_dissemination.Rmd
@@ -235,7 +235,7 @@ mtcars %>%
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95)
+  spread(key = row_cat, value = percent_row_95)
 ```
 
 Where:
@@ -252,9 +252,10 @@ mtcars %>%
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95) %>% 
-  mutate(variable = colnames(.)[1]) %>% 
-  rename("class" = cyl, "am_0" = `0`, "am_1" = `1`)
+  spread(key = row_cat, value = percent_row_95) %>% 
+  select(-row_var) %>% 
+  # Rename to row bind with table shell
+  rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) 
 ```
 
 There may be better ways to do this, but this is the best I've found so far.
@@ -281,9 +282,9 @@ row <- mtcars %>%
   group_by(am, cyl) %>% 
   freq_table() %>% 
   format_table() %>% 
-  spread(key = am, value = percent_row_95) %>% 
-  mutate(variable = colnames(.)[1]) %>% 
-  rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) %>% 
+  spread(key = row_cat, value = percent_row_95) %>% 
+  select(-row_var) %>% 
+  rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
   mutate(class = as.character(class)) # Need for bind_rows below
 
 bind_rows(table, row)
@@ -303,10 +304,10 @@ for (i in seq_along(cat_vars)) {
     group_by(am, !!cat_vars[[i]]) %>% 
     freq_table() %>% 
     format_table() %>% 
-    spread(key = am, value = percent_row_95) %>% 
-    mutate(variable = colnames(.)[1]) %>% 
-    rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% 
-    mutate(class = as.character(class)) # Need for bind_rows below
+    spread(key = row_cat, value = percent_row_95) %>% 
+    select(-row_var) %>% 
+    rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
+    mutate(class = as.character(class))
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -347,8 +348,9 @@ for (i in seq_along(cont_vars)) {
     group_by(am) %>% 
     bfuncs::mean_table(!!cont_vars[[i]]) %>% 
     bfuncs::format_table() %>% 
-    spread(key = am, value = mean_95) %>% 
-    rename("variable" = var, "am_0" = `0`, "am_1" = `1`)
+    spread(key = group_cat, value = mean_95) %>% 
+    select(-group_var) %>% 
+    rename("variable" = response_var, "am_0" = `0`, "am_1" = `1`)
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -364,10 +366,10 @@ for (i in seq_along(cat_vars)) {
     group_by(am, !!cat_vars[[i]]) %>% 
     freq_table() %>% 
     format_table() %>% 
-    spread(key = am, value = percent_row_95) %>% 
-    mutate(variable = colnames(.)[1]) %>% 
-    rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% 
-    mutate(class = as.character(class)) # Need for bind_rows below
+    spread(key = row_cat, value = percent_row_95) %>% 
+    select(-row_var) %>% 
+    rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% 
+    mutate(class = as.character(class))
 
   # Append to bottom of table
   table <- bind_rows(table, row)
@@ -442,7 +444,7 @@ table <- table %>%
 * The best work around I can come up with is to add dashes, then find and replaces dashes with white space in Word.
 
 ```{r}
-# table %>% 
+# table %>%
 #   mutate(
 #     class = stringr::str_replace(class, "^", "---"),
 #     variable = if_else(variable == "", class, variable),
diff --git a/inst/doc/presentation_dissemination.html b/inst/doc/presentation_dissemination.html
index 1dbcb48..c331ba3 100644
--- a/inst/doc/presentation_dissemination.html
+++ b/inst/doc/presentation_dissemination.html
@@ -69,7 +69,7 @@
 
 

Presenting and Disseminating Results

Brad Cannell

-

Created: 2017-11-28
Updated: 2018-01-05

+

Created: 2017-11-28
Updated: 2018-01-07

@@ -149,20 +149,21 @@

Calculate the statistics of interest

Calculating formatted descriptive statistics for continuous variables

The descriptive analysis vignette gives examples for calculating common descriptive statistics for continuous variables. For example:

mtcars %>% group_by(am) %>% mean_table(mpg)
-
## # A tibble: 2 x 8
-##      am   var     n  mean   lcl   ucl   min   max
-##   <dbl> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1     0   mpg    19 17.15 15.30 19.00  10.4  24.4
-## 2     1   mpg    13 24.39 20.67 28.12  15.0  33.9
+
## # A tibble: 2 x 10
+##   response_var group_var group_cat     n  mean       sem   lcl   ucl   min
+##          <chr>     <chr>     <dbl> <int> <dbl>     <dbl> <dbl> <dbl> <dbl>
+## 1          mpg        am         0    19 17.15 0.8795722 15.30 19.00  10.4
+## 2          mpg        am         1    13 24.39 1.7102804 20.67 28.12  15.0
+## # ... with 1 more variables: max <dbl>

Although the statistics we need are there, they aren’t in a form that can easily be put into a Word table. To help with that process, I’ve created the format_table function.

The format_table function is an S3 generic. It currently has methods for formatting the output of the freq_table and mean_table functions. Examples of how to use of format_table will be give here and below.

Example: mean and 95% confidence interval (the default) overall

mtcars %>% mean_table(mpg) %>% format_table()
## # A tibble: 1 x 2
-##     var               mean_95
-##   <chr>                 <chr>
-## 1   mpg 20.09 (17.92 - 22.26)
+## response_var mean_95 +## <chr> <chr> +## 1 mpg 20.09 (17.92 - 22.26)
  • Note: Changing the digits argument to format_table will change the number of digits displayed, but does not change the underlying rounding of the value. That must be changed in the digits argument to mean_table.
@@ -171,27 +172,27 @@

Example: mean and 95% confidence interval (the default) overall

Example: n and mean overall

mtcars %>% mean_table(mpg) %>% format_table(stats = "n and mean")
## # A tibble: 1 x 2
-##     var     n_mean
-##   <chr>      <chr>
-## 1   mpg 32 (20.09)
+## response_var n_mean +## <chr> <chr> +## 1 mpg 32 (20.09)

Example: mean and 95% confidence interval (the default) by subgroup

mtcars %>% group_by(am) %>% mean_table(mpg) %>% format_table()
-
## # A tibble: 2 x 3
-##      am   var               mean_95
-##   <dbl> <chr>                 <chr>
-## 1     0   mpg 17.15 (15.30 - 19.00)
-## 2     1   mpg 24.39 (20.67 - 28.12)
+
## # A tibble: 2 x 4
+##   response_var group_var group_cat               mean_95
+##          <chr>     <chr>     <dbl>                 <chr>
+## 1          mpg        am         0 17.15 (15.30 - 19.00)
+## 2          mpg        am         1 24.39 (20.67 - 28.12)

Example: n and mean by subgroup

mtcars %>% group_by(am) %>% mean_table(mpg) %>% format_table(stats = "n and mean")
-
## # A tibble: 2 x 3
-##      am   var     n_mean
-##   <dbl> <chr>      <chr>
-## 1     0   mpg 19 (17.15)
-## 2     1   mpg 13 (24.39)
+
## # A tibble: 2 x 4
+##   response_var group_var group_cat     n_mean
+##          <chr>     <chr>     <dbl>      <chr>
+## 1          mpg        am         0 19 (17.15)
+## 2          mpg        am         1 13 (24.39)

 


@@ -202,46 +203,46 @@

Calculating formatted descriptive statistics for categorical variables

Example: Overall percent and 95% confidence interval (the default)

mtcars %>% group_by(am) %>% freq_table() %>% format_table()
-
## # A tibble: 2 x 2
-##      am            percent_95
-##   <dbl>                 <chr>
-## 1     0 59.38 (40.94 - 75.50)
-## 2     1 40.62 (24.50 - 59.06)
+
## # A tibble: 2 x 3
+##     var   cat            percent_95
+##   <chr> <dbl>                 <chr>
+## 1    am     0 59.38 (40.94 - 75.50)
+## 2    am     1 40.62 (24.50 - 59.06)

Example: Overall n and percent

mtcars %>% group_by(am) %>% freq_table() %>% format_table(stats = "n and percent")
-
## # A tibble: 2 x 2
-##      am  n_percent
-##   <dbl>      <chr>
-## 1     0 19 (59.38)
-## 2     1 13 (40.62)
+
## # A tibble: 2 x 3
+##     var   cat  n_percent
+##   <chr> <dbl>      <chr>
+## 1    am     0 19 (59.38)
+## 2    am     1 13 (40.62)

Example: Row percent and 95% confidence interval (the default)

mtcars %>% group_by(am, cyl) %>% freq_table() %>% format_table()
-
## # A tibble: 6 x 3
-##      am   cyl        percent_row_95
-##   <dbl> <dbl>                 <chr>
-## 1     0     4 15.79 ( 4.78 - 41.20)
-## 2     0     6 21.05 ( 7.58 - 46.44)
-## 3     0     8 63.16 (38.76 - 82.28)
-## 4     1     4 61.54 (32.30 - 84.29)
-## 5     1     6 23.08 ( 6.91 - 54.82)
-## 6     1     8 15.38 ( 3.43 - 48.18)
+
## # A tibble: 6 x 5
+##   row_var row_cat col_var col_cat        percent_row_95
+##     <chr>   <dbl>   <chr>   <dbl>                 <chr>
+## 1      am       0     cyl       4  15.79 (4.78 - 41.20)
+## 2      am       0     cyl       6  21.05 (7.58 - 46.44)
+## 3      am       0     cyl       8 63.16 (38.76 - 82.28)
+## 4      am       1     cyl       4 61.54 (32.30 - 84.29)
+## 5      am       1     cyl       6  23.08 (6.91 - 54.82)
+## 6      am       1     cyl       8  15.38 (3.43 - 48.18)

Example: N and row percent

mtcars %>% group_by(am, cyl) %>% freq_table(output = "all") %>% format_table(stats = "n and percent")
-
## # A tibble: 6 x 3
-##      am   cyl n_percent_total
-##   <dbl> <dbl>           <chr>
-## 1     0     4       3 ( 9.38)
-## 2     0     6       4 (12.50)
-## 3     0     8      12 (37.50)
-## 4     1     4       8 (25.00)
-## 5     1     6       3 ( 9.38)
-## 6     1     8       2 ( 6.25)
+
## # A tibble: 6 x 5
+##   row_var row_cat col_var col_cat n_percent_total
+##     <chr>   <dbl>   <chr>   <dbl>           <chr>
+## 1      am       0     cyl       4       3 ( 9.38)
+## 2      am       0     cyl       6       4 (12.50)
+## 3      am       0     cyl       8      12 (37.50)
+## 4      am       1     cyl       4       8 (25.00)
+## 5      am       1     cyl       6       3 ( 9.38)
+## 6      am       1     cyl       8       2 ( 6.25)

Creating tables of summary statistics

top

 

@@ -257,13 +258,13 @@

Further formatting for presentation

group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95)
-
## # A tibble: 3 x 3
-##     cyl                   `0`                   `1`
-## * <dbl>                 <chr>                 <chr>
-## 1     4 15.79 ( 4.78 - 41.20) 61.54 (32.30 - 84.29)
-## 2     6 21.05 ( 7.58 - 46.44) 23.08 ( 6.91 - 54.82)
-## 3     8 63.16 (38.76 - 82.28) 15.38 ( 3.43 - 48.18)
+ spread(key = row_cat, value = percent_row_95) +
## # A tibble: 3 x 5
+##   row_var col_var col_cat                   `0`                   `1`
+## *   <chr>   <chr>   <dbl>                 <chr>                 <chr>
+## 1      am     cyl       4  15.79 (4.78 - 41.20) 61.54 (32.30 - 84.29)
+## 2      am     cyl       6  21.05 (7.58 - 46.44)  23.08 (6.91 - 54.82)
+## 3      am     cyl       8 63.16 (38.76 - 82.28)  15.38 (3.43 - 48.18)

Where:

  • Key = the variable whose levels make up our comparison groups of interest

  • @@ -274,15 +275,16 @@

    Further formatting for presentation

    group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + # Rename to row bind with table shell + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`)
    ## # A tibble: 3 x 4
    -##   class                  am_0                  am_1 variable
    -##   <dbl>                 <chr>                 <chr>    <chr>
    -## 1     4 15.79 ( 4.78 - 41.20) 61.54 (32.30 - 84.29)      cyl
    -## 2     6 21.05 ( 7.58 - 46.44) 23.08 ( 6.91 - 54.82)      cyl
    -## 3     8 63.16 (38.76 - 82.28) 15.38 ( 3.43 - 48.18)      cyl
    +## variable class am_0 am_1 +## * <chr> <dbl> <chr> <chr> +## 1 cyl 4 15.79 (4.78 - 41.20) 61.54 (32.30 - 84.29) +## 2 cyl 6 21.05 (7.58 - 46.44) 23.08 (6.91 - 54.82) +## 3 cyl 8 63.16 (38.76 - 82.28) 15.38 (3.43 - 48.18)

    There may be better ways to do this, but this is the best I’ve found so far.

    Creating tables of summary statistics

    top

    @@ -297,9 +299,9 @@

    Fill in the table shell

    group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) %>% + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% mutate(class = as.character(class)) # Need for bind_rows below bind_rows(table, row) @@ -307,9 +309,9 @@

    Fill in the table shell

    ## variable class am_0 am_1 ## <chr> <chr> <chr> <chr> ## 1 N = 19 N = 13 -## 2 cyl 4 15.79 ( 4.78 - 41.20) 61.54 (32.30 - 84.29) -## 3 cyl 6 21.05 ( 7.58 - 46.44) 23.08 ( 6.91 - 54.82) -## 4 cyl 8 63.16 (38.76 - 82.28) 15.38 ( 3.43 - 48.18) +## 2 cyl 4 15.79 (4.78 - 41.20) 61.54 (32.30 - 84.29) +## 3 cyl 6 21.05 (7.58 - 46.44) 23.08 (6.91 - 54.82) +## 4 cyl 8 63.16 (38.76 - 82.28) 15.38 (3.43 - 48.18)

    Then, it’s trivial to extend this method to multiple variables using a for loop.

    # Select variables 
     cat_vars <- quos(cyl, vs)
    @@ -322,10 +324,10 @@ 

    Fill in the table shell

    group_by(am, !!cat_vars[[i]]) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% - mutate(class = as.character(class)) # Need for bind_rows below + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% + mutate(class = as.character(class)) # Append to bottom of table table <- bind_rows(table, row) @@ -336,9 +338,9 @@

    Fill in the table shell

    ## variable class am_0 am_1 ## <chr> <chr> <chr> <chr> ## 1 N = 19 N = 13 -## 2 cyl 4 15.79 ( 4.78 - 41.20) 61.54 (32.30 - 84.29) -## 3 cyl 6 21.05 ( 7.58 - 46.44) 23.08 ( 6.91 - 54.82) -## 4 cyl 8 63.16 (38.76 - 82.28) 15.38 ( 3.43 - 48.18) +## 2 cyl 4 15.79 (4.78 - 41.20) 61.54 (32.30 - 84.29) +## 3 cyl 6 21.05 (7.58 - 46.44) 23.08 (6.91 - 54.82) +## 4 cyl 8 63.16 (38.76 - 82.28) 15.38 (3.43 - 48.18) ## 5 vs 0 63.16 (38.76 - 82.28) 46.15 (20.83 - 73.63) ## 6 vs 1 36.84 (17.72 - 61.24) 53.85 (26.37 - 79.17)
      @@ -372,8 +374,9 @@

      Example: Continuous and categorical variables

      group_by(am) %>% bfuncs::mean_table(!!cont_vars[[i]]) %>% bfuncs::format_table() %>% - spread(key = am, value = mean_95) %>% - rename("variable" = var, "am_0" = `0`, "am_1" = `1`) + spread(key = group_cat, value = mean_95) %>% + select(-group_var) %>% + rename("variable" = response_var, "am_0" = `0`, "am_1" = `1`) # Append to bottom of table table <- bind_rows(table, row) @@ -389,10 +392,10 @@

      Example: Continuous and categorical variables

      group_by(am, !!cat_vars[[i]]) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% - mutate(class = as.character(class)) # Need for bind_rows below + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% + mutate(class = as.character(class)) # Append to bottom of table table <- bind_rows(table, row) @@ -400,16 +403,16 @@

      Example: Continuous and categorical variables

      print(table)
    ## # A tibble: 8 x 4
    -##   variable class                     am_0                     am_1
    -##      <chr> <chr>                    <chr>                    <chr>
    -## 1                                  N = 19                   N = 13
    -## 2      mpg  <NA>    17.15 (15.30 - 19.00)    24.39 (20.67 - 28.12)
    -## 3     disp  <NA> 290.38 (237.28 - 343.48) 143.53 ( 90.83 - 196.23)
    -## 4      cyl     4    15.79 ( 4.78 - 41.20)    61.54 (32.30 - 84.29)
    -## 5      cyl     6    21.05 ( 7.58 - 46.44)    23.08 ( 6.91 - 54.82)
    -## 6      cyl     8    63.16 (38.76 - 82.28)    15.38 ( 3.43 - 48.18)
    -## 7       vs     0    63.16 (38.76 - 82.28)    46.15 (20.83 - 73.63)
    -## 8       vs     1    36.84 (17.72 - 61.24)    53.85 (26.37 - 79.17)
    +## variable class am_0 am_1 +## <chr> <chr> <chr> <chr> +## 1 N = 19 N = 13 +## 2 mpg <NA> 17.15 (15.30 - 19.00) 24.39 (20.67 - 28.12) +## 3 disp <NA> 290.38 (237.28 - 343.48) 143.53 (90.83 - 196.23) +## 4 cyl 4 15.79 (4.78 - 41.20) 61.54 (32.30 - 84.29) +## 5 cyl 6 21.05 (7.58 - 46.44) 23.08 (6.91 - 54.82) +## 6 cyl 8 63.16 (38.76 - 82.28) 15.38 (3.43 - 48.18) +## 7 vs 0 63.16 (38.76 - 82.28) 46.15 (20.83 - 73.63) +## 8 vs 1 36.84 (17.72 - 61.24) 53.85 (26.37 - 79.17)
    • Again, this still feels clunky.
    @@ -443,8 +446,8 @@

    Example: Improve row headers

    ## 1 N = 19 ## 2 Miles per gallon, mean (95% CI) <NA> 17.15 (15.30 - 19.00) ## 3 Displacement, mean (95% CI) <NA> 290.38 (237.28 - 343.48) -## 4 Number of cylinders, percent (95% CI) 4 15.79 ( 4.78 - 41.20) -## 5 Number of cylinders, percent (95% CI) 6 21.05 ( 7.58 - 46.44) +## 4 Number of cylinders, percent (95% CI) 4 15.79 (4.78 - 41.20) +## 5 Number of cylinders, percent (95% CI) 6 21.05 (7.58 - 46.44) ## 6 Number of cylinders, percent (95% CI) 8 63.16 (38.76 - 82.28) ## 7 V/S, percent (95% CI) 0 63.16 (38.76 - 82.28) ## 8 V/S, percent (95% CI) 1 36.84 (17.72 - 61.24) @@ -471,8 +474,8 @@

    Example: Remove duplicate variable names for categorical variables

    ## 1 N = 19 ## 2 Miles per gallon, mean (95% CI) <NA> 17.15 (15.30 - 19.00) ## 3 Displacement, mean (95% CI) <NA> 290.38 (237.28 - 343.48) -## 4 Number of cylinders, percent (95% CI) 4 15.79 ( 4.78 - 41.20) -## 5 6 21.05 ( 7.58 - 46.44) +## 4 Number of cylinders, percent (95% CI) 4 15.79 (4.78 - 41.20) +## 5 6 21.05 (7.58 - 46.44) ## 6 8 63.16 (38.76 - 82.28) ## 7 V/S, percent (95% CI) 0 63.16 (38.76 - 82.28) ## 8 1 36.84 (17.72 - 61.24) @@ -488,7 +491,7 @@

    Example: Slide classes to the left, under variable names

  • For some reason, R automatically strips the leading white space.

  • The best work around I can come up with is to add dashes, then find and replaces dashes with white space in Word.

-
# table %>% 
+
# table %>%
 #   mutate(
 #     class = stringr::str_replace(class, "^", "---"),
 #     variable = if_else(variable == "", class, variable),
@@ -515,15 +518,15 @@ 

Knit to Kable

print(table_kable)
## 
 ## 
-## Characteristic                          Class   Automatic Transmission     manual Transmission      
-## --------------------------------------  ------  -------------------------  -------------------------
-##                                                 N = 19                     N = 13                   
-## Miles per gallon, mean (95% CI)         NA      17.15 (15.30 - 19.00)      24.39 (20.67 - 28.12)    
-## Displacement, mean (95% CI)             NA      290.38 (237.28 - 343.48)   143.53 ( 90.83 - 196.23) 
-## Number of cylinders, percent (95% CI)   4       15.79 ( 4.78 - 41.20)      61.54 (32.30 - 84.29)    
-##                                         6       21.05 ( 7.58 - 46.44)      23.08 ( 6.91 - 54.82)    
-##                                         8       63.16 (38.76 - 82.28)      15.38 ( 3.43 - 48.18)    
-## V/S, percent (95% CI)                   0       63.16 (38.76 - 82.28)      46.15 (20.83 - 73.63)    
+## Characteristic                          Class   Automatic Transmission     manual Transmission     
+## --------------------------------------  ------  -------------------------  ------------------------
+##                                                 N = 19                     N = 13                  
+## Miles per gallon, mean (95% CI)         NA      17.15 (15.30 - 19.00)      24.39 (20.67 - 28.12)   
+## Displacement, mean (95% CI)             NA      290.38 (237.28 - 343.48)   143.53 (90.83 - 196.23) 
+## Number of cylinders, percent (95% CI)   4       15.79 (4.78 - 41.20)       61.54 (32.30 - 84.29)   
+##                                         6       21.05 (7.58 - 46.44)       23.08 (6.91 - 54.82)    
+##                                         8       63.16 (38.76 - 82.28)      15.38 (3.43 - 48.18)    
+## V/S, percent (95% CI)                   0       63.16 (38.76 - 82.28)      46.15 (20.83 - 73.63)   
 ##                                         1       36.84 (17.72 - 61.24)      53.85 (26.37 - 79.17)
  • Get rid of class NA.
  • diff --git a/inst/doc/using_tabstat_with_dplyr.html b/inst/doc/using_tabstat_with_dplyr.html index 9eb698a..6784a63 100644 --- a/inst/doc/using_tabstat_with_dplyr.html +++ b/inst/doc/using_tabstat_with_dplyr.html @@ -69,7 +69,7 @@

    Using tabstat with dplyr

    Brad Cannell

    -

    Created: 2017-11-17
    Updated: 2018-01-05

    +

    Created: 2017-11-17
    Updated: 2018-01-07

    @@ -138,31 +138,33 @@

    Example 4: Tabstat (multiple return values) with grouping

    [1] stats graphics grDevices utils datasets methods base other attached packages: - [1] DiagrammeR_0.9.2 bindrcpp_0.2 bfuncs_0.2 forcats_0.2.0 - [5] stringr_1.2.0 dplyr_0.7.4 purrr_0.2.4 readr_1.1.1 - [9] tidyr_0.7.2 tibble_1.3.4 ggplot2_2.2.1 tidyverse_1.2.1 + [1] DiagrammeR_0.9.2 testthat_1.0.2 bindrcpp_0.2 + [4] bfuncs_0.2 forcats_0.2.0 stringr_1.2.0 + [7] dplyr_0.7.4 purrr_0.2.4 readr_1.1.1 +[10] tidyr_0.7.2 tibble_1.3.4 ggplot2_2.2.1.9000 +[13] tidyverse_1.2.1 loaded via a namespace (and not attached): - [1] Rcpp_0.12.13 lubridate_1.7.1 lattice_0.20-35 + [1] Rcpp_0.12.14 lubridate_1.7.1 lattice_0.20-35 [4] visNetwork_2.0.2 assertthat_0.2.0 rprojroot_1.2 - [7] digest_0.6.13 psych_1.7.8 R6_2.2.2 + [7] digest_0.6.12 psych_1.7.8 R6_2.2.2 [10] cellranger_1.1.0 plyr_1.8.4 backports_1.1.0 [13] evaluate_0.10.1 httr_1.3.1 highr_0.6 -[16] rlang_0.1.6 lazyeval_0.2.0 readxl_1.0.0 +[16] rlang_0.1.6 lazyeval_0.2.1 readxl_1.0.0 [19] rstudioapi_0.7 rmarkdown_1.8 devtools_1.13.3 [22] downloader_0.4 foreign_0.8-69 htmlwidgets_0.9 [25] igraph_1.1.2 munsell_0.4.3 broom_0.4.2 -[28] compiler_3.4.3 influenceR_0.1.0 rgexf_0.15.3 +[28] rgexf_0.15.3 compiler_3.4.3 influenceR_0.1.0 [31] modelr_0.1.1 pkgconfig_2.0.1 mnormt_1.5-5 -[34] htmltools_0.3.6 tidyselect_0.2.3 gridExtra_2.3 +[34] htmltools_0.3.6 tidyselect_0.2.2 gridExtra_2.3 [37] XML_3.98-1.9 viridisLite_0.2.0 crayon_1.3.4 -[40] withr_2.0.0 grid_3.4.3 nlme_3.1-131 +[40] withr_2.1.0.9000 grid_3.4.3 nlme_3.1-131 [43] jsonlite_1.5 gtable_0.2.0 magrittr_1.5 -[46] scales_0.5.0 cli_1.0.0 stringi_1.1.5 +[46] scales_0.5.0.9000 cli_1.0.0 stringi_1.1.5 [49] reshape2_1.4.2 viridis_0.4.0 xml2_1.1.1 [52] brew_1.0-6 RColorBrewer_1.1-2 tools_3.4.3 -[55] glue_1.2.0 hms_0.3 Rook_1.1-1 -[58] parallel_3.4.3 yaml_2.1.16 colorspace_1.3-2 +[55] glue_1.2.0 Rook_1.1-1 hms_0.3 +[58] parallel_3.4.3 yaml_2.1.14 colorspace_1.3-2 [61] rvest_0.3.2 memoise_1.1.0 knitr_1.17 [64] bindr_0.1 haven_1.1.0
diff --git a/man/format_table.Rd b/man/format_table.Rd index 6b05c01..7953d2f 100644 --- a/man/format_table.Rd +++ b/man/format_table.Rd @@ -27,7 +27,7 @@ format_table(.data, ...) \item{...}{Other parameters to be passed on.} -\item{digits}{Determines the number of decimal place to display. Passed to +\item{digits}{Determines the number of decimal places to display. Passed to the "nsmall =" parameter of the format function. Note: Changing the digits argument to format_table will change the number @@ -62,28 +62,74 @@ library(bfuncs) data(mtcars) -# Overall mean table +# Overall mean table with defaults mtcars \%>\% mean_table(mpg) \%>\% format_table() #> # A tibble: 1 x 2 -#> var mean_95 -#> -#> 1 mpg 20.09 (17.92 - 22.26) +#> response_var mean_95 +#> +#> 1 mpg 20.09 (17.92 - 22.26) -# Grouped means table +# Grouped means table with defaults mtcars \%>\% group_by(cyl) \%>\% mean_table(mpg) \%>\% format_table() -#> # A tibble: 3 x 3 -#> cyl var mean_95 -#> -#> 1 4 mpg 26.66 (23.63 - 29.69) -#> 2 6 mpg 19.74 (18.40 - 21.09) -#> 3 8 mpg 15.10 (13.62 - 16.58) +#> # A tibble: 3 x 4 +#> response_var group_var group_cat mean_95 +#> +#> 1 mpg cyl 4 26.66 (23.63 - 29.69) +#> 2 mpg cyl 6 19.74 (18.40 - 21.09) +#> 3 mpg cyl 8 15.10 (13.62 - 16.58) + +# One-way frequency tables with defaults + +mtcars \%>\% +group_by(cyl) \%>\% + mean_table(mpg) \%>\% + format_table() +#> # A tibble: 2 x 3 +#> var cat percent_95 +#> +#> 1 am 0 59.38 (40.94 - 75.50) +#> 2 am 1 40.62 (24.50 - 59.06) + +# Two-way frequency tables with defaults + +mtcars \%>\% + group_by(am, cyl) \%>\% + freq_table() \%>\% + format_table() + +#> # A tibble: 6 x 5 +#> row_var row_cat col_var col_cat percent_row_95 +#> +#> 1 am 0 cyl 4 15.79 (4.78 - 41.20) +#> 2 am 0 cyl 6 21.05 (7.58 - 46.44) +#> 3 am 0 cyl 8 63.16 (38.76 - 82.28) +#> 4 am 1 cyl 4 61.54 (32.30 - 84.29) +#> 5 am 1 cyl 6 23.08 (6.91 - 54.82) +#> 6 am 1 cyl 8 15.38 (3.43 - 48.18) + +#' # Two-way frequency tables with with stats = "n and row percent" + +mtcars \%>\% + group_by(am, cyl) \%>\% + freq_table(output = all) \%>\% # Don't forget output = all + format_table(stats = "n and row percent") + +#> # A tibble: 6 x 5 +#> row_var row_cat col_var col_cat n_percent_row +#> +#> 1 am 0 cyl 4 3 (15.79) +#> 2 am 0 cyl 6 4 (21.05) +#> 3 am 0 cyl 8 12 (63.16) +#> 4 am 1 cyl 4 8 (61.54) +#> 5 am 1 cyl 6 3 (23.08) +#> 6 am 1 cyl 8 2 (15.38) } diff --git a/man/freq_table.Rd b/man/freq_table.Rd index 3d8bfb7..2d4d3e8 100644 --- a/man/freq_table.Rd +++ b/man/freq_table.Rd @@ -73,6 +73,7 @@ The freq_table function produces one-way and two-way frequency } \examples{ library(tidyverse) +library(bfuncs) data(mtcars) @@ -82,11 +83,11 @@ mtcars \%>\% group_by(am) \%>\% freq_table() -#> # A tibble: 2 x 6 -#> am n n_total percent lcl_log ucl_log -#> -#> 1 0 19 32 59.38 40.94 75.50 -#> 2 1 13 32 40.62 24.50 59.06 +#> # A tibble: 2 x 7 +#> var cat n n_total percent lcl ucl +#> +#> 1 am 0 19 32 59.38 40.94 75.50 +#> 2 am 1 13 32 40.62 24.50 59.06 # Two-way frequency table with defaults @@ -94,15 +95,15 @@ mtcars \%>\% group_by(am, cyl) \%>\% freq_table() -#> # A tibble: 6 x 8 -#> am cyl n n_row n_total percent_row lcl_row_log ucl_row_log -#> -#> 1 0 4 3 19 32 15.79 4.78 41.20 -#> 2 0 6 4 19 32 21.05 7.58 46.44 -#> 3 0 8 12 19 32 63.16 38.76 82.28 -#> 4 1 4 8 13 32 61.54 32.30 84.29 -#> 5 1 6 3 13 32 23.08 6.91 54.82 -#> 6 1 8 2 13 32 15.38 3.43 48.18 +#> # A tibble: 6 x 10 +#> row_var row_cat col_var col_cat n n_row n_total percent_row lcl_row ucl_row +#> +#> 1 am 0 cyl 4 3 19 32 15.79 4.78 41.20 +#> 2 am 0 cyl 6 4 19 32 21.05 7.58 46.44 +#> 3 am 0 cyl 8 12 19 32 63.16 38.76 82.28 +#> 4 am 1 cyl 4 8 13 32 61.54 32.30 84.29 +#> 5 am 1 cyl 6 3 13 32 23.08 6.91 54.82 +#> 6 am 1 cyl 8 2 13 32 15.38 3.43 48.18 } \references{ Agresti, A. (2012). Categorical Data Analysis (3rd ed.). Hoboken, NJ: Wiley. diff --git a/man/freq_test.Rd b/man/freq_test.Rd index 5d8e31c..71010a4 100644 --- a/man/freq_test.Rd +++ b/man/freq_test.Rd @@ -45,13 +45,13 @@ mtcars \%>\% group_by(am) \%>\% freq_table() \%>\% freq_test() \%>\% - select(1:6, p_chi2_pearson) + select(var:percent, p_chi2_pearson) -#> # A tibble: 2 x 7 -#> am n n_total percent lcl ucl p_chi2_pearson -#> -#> 1 0 19 32 59.38 40.94 75.50 0.2888444 -#> 2 1 13 32 40.62 24.50 59.06 0.2888444 +#> # A tibble: 2 x 6 +#> var cat n n_total percent p_chi2_pearson +#> +#> 1 am 0 19 32 59.38 0.2888444 +#> 2 am 1 13 32 40.62 0.2888444 # Chi-square test of independence @@ -59,13 +59,13 @@ mtcars \%>\% group_by(am, vs) \%>\% freq_table() \%>\% freq_test() \%>\% - select(1:8, p_chi2_pearson) + select(row_var:n, percent_row, p_chi2_pearson) -#> # A tibble: 4 x 9 -#> am vs n n_row n_total percent_row lcl_row ucl_row p_chi2_pearson -#> -#> 1 0 0 12 19 32 63.16 38.76 82.28 0.3409429 -#> 2 0 1 7 19 32 36.84 17.72 61.24 0.3409429 -#> 3 1 0 6 13 32 46.15 20.83 73.63 0.3409429 -#> 4 1 1 7 13 32 53.85 26.37 79.17 0.3409429 +#> # A tibble: 4 x 7 +#> row_var row_cat col_var col_cat n percent_row p_chi2_pearson +#> +#> 1 am 0 vs 0 12 63.16 0.3409429 +#> 2 am 0 vs 1 7 36.84 0.3409429 +#> 3 am 1 vs 0 6 46.15 0.3409429 +#> 4 am 1 vs 1 7 53.85 0.3409429 } diff --git a/man/mean_table.Rd b/man/mean_table.Rd index dc04f74..d33b1b3 100644 --- a/man/mean_table.Rd +++ b/man/mean_table.Rd @@ -4,7 +4,7 @@ \alias{mean_table} \title{Estimate Percents and 95 Percent Confidence Intervals in dplyr Pipelines} \usage{ -mean_table(.data, x, t_prob = 0.975, output = "default", digits = 2, ...) +mean_table(.data, x, t_prob = 0.975, output = default, digits = 2, ...) } \arguments{ \item{.data}{A tibble or grouped tibble.} @@ -18,10 +18,9 @@ distribution with n - 1 degrees of freedom.} \item{output}{Options for this parameter are "default" and "all". - Default output includes the n, mean, and 95% confidence interval for the - mean. Using output = "all" also returns the standard error of the number - of missing values for x, the critical t-value, and the standard error of - the mean.} + Default output includes the n, mean, sem, and 95% confidence interval for + the mean. Using output = "all" also returns the the number of missing + values for x and the critical t-value.} \item{digits}{Round mean, lcl, and ucl to digits. Default is 2.} @@ -49,10 +48,10 @@ data(mtcars) mtcars \%>\% mean_table(mpg) -#> # A tibble: 1 x 5 -#> var n mean lcl ucl min max -#> -#> 1 mpg 32 20.09 17.92 22.26 10.4 33.9 +#> # A tibble: 1 x 8 +#> response_var n mean sem lcl ucl min max +#> +#> 1 mpg 32 20.09 1.065424 17.92 22.26 10.4 33.9 # Grouped means table with defaults @@ -60,12 +59,12 @@ mtcars \%>\% group_by(cyl) \%>\% mean_table(mpg) -#> # A tibble: 3 x 6 -#> cyl var n mean lcl ucl min max -#> -#> 1 4 mpg 11 26.66 23.63 29.69 21.4 33.9 -#> 2 6 mpg 7 19.74 18.40 21.09 17.8 21.4 -#> 3 8 mpg 14 15.10 13.62 16.58 10.4 19.2 +#> # A tibble: 3 x 10 +#> response_var group_var group_cat n mean sem lcl ucl min max +#> +#> 1 mpg cyl 4 11 26.66 1.3597642 23.63 29.69 21.4 33.9 +#> 2 mpg cyl 6 7 19.74 0.5493967 18.40 21.09 17.8 21.4 +#> 3 mpg cyl 8 14 15.10 0.6842016 13.62 16.58 10.4 19.2 } \references{ SAS documentation: http://support.sas.com/documentation/cdl/en/proc/65145/HTML/default/viewer.htm#p0klmrp4k89pz0n1p72t0clpavyx.htm diff --git a/tests/testthat/test-format_table.R b/tests/testthat/test-format_table.R new file mode 100644 index 0000000..38d32e3 --- /dev/null +++ b/tests/testthat/test-format_table.R @@ -0,0 +1,312 @@ +library(tidyverse) +library(bfuncs) +data(mtcars) + +context("test-format_table.R") + +# ============================================================================= +# Test one-way table of means, stats = mean and ci (default) +# ============================================================================= +df <- mtcars %>% + mean_table(mpg) %>% + format_table() + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(1L, 2L)) +}) + +test_that("The correct var name is returned by format_table", { + response_var <- pull(df, response_var) %>% unique() + expect_match(response_var, "mpg") +}) + +test_that("The correct statistics are returned by format_table", { + mean_95 <- pull(df, mean_95) + expect_equal(mean_95, "20.09 (17.92 - 22.26)") +}) + + + + +# ============================================================================= +# Test one-way table of means, stats = n and mean +# ============================================================================= +df <- mtcars %>% + mean_table(mpg) %>% + format_table(stats = "n and mean") + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(1L, 2L)) +}) + +test_that("The correct var name is returned by format_table", { + response_var <- pull(df, response_var) %>% unique() + expect_match(response_var, "mpg") +}) + +test_that("The correct statistics are returned by format_table", { + n_mean <- pull(df, n_mean) + expect_equal(n_mean, "32 (20.09)") +}) + + + + +# ============================================================================= +# Test grouped table of means, stats = mean and ci (default) +# ============================================================================= +df <- mtcars %>% + group_by(am) %>% + mean_table(mpg) %>% + format_table() + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(2L, 4L)) +}) + +test_that("The correct var name is returned by format_table", { + response_var <- pull(df, response_var) %>% unique() + group_var <- pull(df, group_var) %>% unique() + + expect_match(response_var, "mpg") + expect_match(group_var, "am") +}) + +test_that("The expected group categories are returned by format_table", { + group_cat <- pull(df, group_cat) + expect_equal(group_cat, c(0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + mean_95 <- pull(df, mean_95) + expect_equal(mean_95, c("17.15 (15.30 - 19.00)", "24.39 (20.67 - 28.12)")) +}) + + + + +# ============================================================================= +# Test grouped table of means, stats = n and mean +# ============================================================================= +df <- mtcars %>% + group_by(am) %>% + mean_table(mpg) %>% + format_table(stats = "n and mean") + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(2L, 4L)) +}) + +test_that("The correct var name is returned by format_table", { + response_var <- pull(df, response_var) %>% unique() + group_var <- pull(df, group_var) %>% unique() + + expect_match(response_var, "mpg") + expect_match(group_var, "am") +}) + +test_that("The expected group categories are returned by format_table", { + group_cat <- pull(df, group_cat) + expect_equal(group_cat, c(0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + n_mean <- pull(df, n_mean) + expect_equal(n_mean, c("19 (17.15)", "13 (24.39)")) +}) + + + + +# ============================================================================= +# Test grouped table of means with two group_by vars +# ============================================================================= +df <- mtcars %>% + group_by(am, vs) %>% + mean_table(mpg) %>% + format_table(stats = "n and mean") + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(4L, 6L)) +}) + +test_that("The correct var name is returned by format_table", { + response_var <- pull(df, response_var) %>% unique() + group_1 <- pull(df, group_1) %>% unique() + group_2 <- pull(df, group_2) %>% unique() + + expect_match(response_var, "mpg") + expect_match(group_1, "am") + expect_match(group_2, "vs") +}) + +test_that("The expected group categories are returned by format_table", { + group_1_cat <- pull(df, group_1_cat) + group_2_cat <- pull(df, group_2_cat) + + expect_equal(group_1_cat, c(0L, 0L, 1L, 1L)) + expect_equal(group_2_cat, c(0L, 1L, 0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + n_mean <- pull(df, n_mean) + expect_equal(n_mean, c("12 (15.05)", "7 (20.74)", "6 (19.75)", "7 (28.37)")) +}) + + + + +# ============================================================================= +# Test one-way frequency table, stats = percent and ci (default) +# ============================================================================= +df <- mtcars %>% + group_by(am) %>% + freq_table() %>% + format_table() + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(2L, 3L)) +}) + +test_that("The correct var name is returned by format_table", { + var <- pull(df, var) %>% unique() + expect_match(var, "am") +}) + +test_that("The correct variables levels are returned by format_table", { + cat <- pull(df, cat) + expect_equal(cat, c(0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + percent_95 <- pull(df, percent_95) + expect_equal(percent_95, c("59.38 (40.94 - 75.50)", "40.62 (24.50 - 59.06)")) +}) + + + + +# ============================================================================= +# Test one-way frequency table, stats = n and percent +# ============================================================================= +df <- mtcars %>% + group_by(am) %>% + freq_table() %>% + format_table(stats = "n and percent") + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(2L, 3L)) +}) + +test_that("The correct var name is returned by format_table", { + var <- pull(df, var) %>% unique() + expect_match(var, "am") +}) + +test_that("The correct variables levels are returned by format_table", { + cat <- pull(df, cat) + expect_equal(cat, c(0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + n_percent <- pull(df, n_percent) + expect_equal(n_percent, c("19 (59.38)", "13 (40.62)")) +}) + + + + +# ============================================================================= +# Test two-way frequency table, stats = percent and ci (default) +# ============================================================================= +df <- mtcars %>% + group_by(am, vs) %>% + freq_table() %>% + format_table() + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(4L, 5L)) +}) + +test_that("The correct var name is returned by format_table", { + row_var <- pull(df, row_var) %>% unique() + col_var <- pull(df, col_var) %>% unique() + + expect_match(row_var, "am") + expect_match(col_var, "vs") +}) + +test_that("The correct variables levels are returned by format_table", { + row_cat <- pull(df, row_cat) + col_cat <- pull(df, col_cat) + + expect_equal(row_cat, c(0L, 0L, 1L, 1L)) + expect_equal(col_cat, c(0L, 1L, 0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + percent_row_95 <- pull(df, percent_row_95) + expect_equal(percent_row_95, c("63.16 (38.76 - 82.28)", "36.84 (17.72 - 61.24)", + "46.15 (20.83 - 73.63)", "53.85 (26.37 - 79.17)")) +}) + + + + +# ============================================================================= +# Test two-way frequency table, stats = n and percent +# ============================================================================= +df <- mtcars %>% + group_by(am, vs) %>% + freq_table(output = all) %>% + format_table(stats = "n and percent") + +test_that("Dimensions of the object returned by format_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(4L, 5L)) +}) + +test_that("The correct var name is returned by format_table", { + row_var <- pull(df, row_var) %>% unique() + col_var <- pull(df, col_var) %>% unique() + + expect_match(row_var, "am") + expect_match(col_var, "vs") +}) + +test_that("The correct variables levels are returned by format_table", { + row_cat <- pull(df, row_cat) + col_cat <- pull(df, col_cat) + + expect_equal(row_cat, c(0L, 0L, 1L, 1L)) + expect_equal(col_cat, c(0L, 1L, 0L, 1L)) +}) + +test_that("The correct statistics are returned by format_table", { + n_percent_total <- pull(df, n_percent_total) + expect_equal(n_percent_total, c("12 (37.50)", "7 (21.88)", + "6 (18.75)", "7 (21.88)")) +}) + + + + +# ============================================================================= +# Test digits option +# ============================================================================= +df <- mtcars %>% + mean_table(mpg) %>% + format_table(digits = 3) + +test_that("The digits argument of format_table works as expected", { + mean_95 <- pull(df, mean_95) + expect_equal(mean_95, "20.090 (17.920 - 22.260)") +}) diff --git a/tests/testthat/test-freq_table.R b/tests/testthat/test-freq_table.R index de569f7..e5ed6e9 100644 --- a/tests/testthat/test-freq_table.R +++ b/tests/testthat/test-freq_table.R @@ -16,7 +16,7 @@ test_that("Dimensions of the object returned by freq_table are as expected", { columns <- ncol(df) expect_equal(rows, 2L) - expect_equal(columns, 6L) + expect_equal(columns, 7L) }) test_that("Class of freq_table_one_way is freq_table_one_way", { @@ -25,20 +25,25 @@ test_that("Class of freq_table_one_way is freq_table_one_way", { test_that("The correct var name is returned by freq_table", { name <- names(df)[1] - expect_match(name, "am") + expect_match(name, "var") +}) + +test_that("The correct cat var name is returned by freq_table", { + name <- names(df)[2] + expect_match(name, "cat") }) test_that("The correct variables levels are returned by freq_table", { - levels <- df[, 1] %>% unlist() %>% unname() + levels <- pull(df, cat) expect_equal(levels, c(0, 1)) }) test_that("The correct default statistics are returned by freq_table", { - n <- df[, 2] %>% unlist() %>% unname() - n_total <- df[, 3] %>% unlist() %>% unname() - percent <- df[, 4] %>% unlist() %>% unname() - lcl <- df[, 5] %>% unlist() %>% unname() - ucl <- df[, 6] %>% unlist() %>% unname() + n <- pull(df, n) + n_total <- pull(df, n_total) + percent <- pull(df, percent) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) expect_equal(n, c(19, 13)) expect_equal(n_total, c(32, 32)) @@ -54,11 +59,11 @@ df <- mtcars %>% freq_table(ci_type = "wald") test_that("The correct Wald CI's are returned by freq_table", { - lcl <- df[1, 5] %>% as.numeric() - ucl <- df[1, 6] %>% as.numeric() + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) - expect_equal(lcl, 41.38) - expect_equal(ucl, 77.37) + expect_equal(lcl, c(41.38, 22.63)) + expect_equal(ucl, c(77.37, 58.62)) }) @@ -76,7 +81,7 @@ test_that("Dimensions of the object returned by freq_table are as expected", { columns <- ncol(df) expect_equal(rows, 6L) - expect_equal(columns, 8L) + expect_equal(columns, 10L) }) test_that("Class of freq_table_two_way is freq_table_two_way", { @@ -84,28 +89,28 @@ test_that("Class of freq_table_two_way is freq_table_two_way", { }) test_that("The correct var names are returned by freq_table", { - group_1 <- names(df)[1] - group_2 <- names(df)[2] + row_var <- pull(df, row_var) + col_var <- pull(df, col_var) - expect_match(group_1, "am") - expect_match(group_2, "cyl") + expect_match(row_var, "am") + expect_match(col_var, "cyl") }) test_that("The correct variables levels are returned by freq_table", { - level_1 <- df[, 1] %>% unlist() %>% unname() - level_2 <- df[, 2] %>% unlist() %>% unname() + row_cat <- pull(df, row_cat) + col_cat <- pull(df, col_cat) - expect_equal(level_1, c(0, 0, 0, 1, 1, 1)) - expect_equal(level_2, c(4, 6, 8, 4, 6, 8)) + expect_equal(row_cat, c(0, 0, 0, 1, 1, 1)) + expect_equal(col_cat, c(4, 6, 8, 4, 6, 8)) }) test_that("The correct default statistics are returned by freq_table", { - n <- df[, 3] %>% unlist() %>% unname() - n_row <- df[, 4] %>% unlist() %>% unname() - n_total <- df[, 5] %>% unlist() %>% unname() - percent_row <- df[, 6] %>% unlist() %>% unname() - lcl_row <- df[, 7] %>% unlist() %>% unname() - ucl_row <- df[, 8] %>% unlist() %>% unname() + n <- pull(df, n) + n_row <- pull(df, n_row) + n_total <- pull(df, n_total) + percent_row <- pull(df, percent_row) + lcl_row <- pull(df, lcl_row) + ucl_row <- pull(df, ucl_row) expect_equal(n, c(3, 4, 12, 8, 3, 2)) expect_equal(n_row, c(rep(19, 3), rep(13, 3))) @@ -122,9 +127,9 @@ df <- mtcars %>% freq_table(output = "all") test_that("The correct overall percents and 95% CI's are returned", { - percent_total <- df[, 6] %>% unlist() %>% unname() - lcl_total <- df[, 9] %>% unlist() %>% unname() - ucl_total <- df[, 10] %>% unlist() %>% unname() + percent_total <- pull(df, percent_total) + lcl_total <- pull(df, lcl_total) + ucl_total <- pull(df, ucl_total) expect_equal(percent_total, c(9.38, 12.50, 37.50, 25.00, 9.38, 6.25)) expect_equal(lcl_total, c(2.86, 4.51, 21.97, 12.51, 2.86, 1.45)) @@ -148,8 +153,8 @@ df <- mtcars %>% freq_table(t_prob = t) test_that("The 99% confidence intervals are correct", { - lcl <- df[, 5] %>% unlist() %>% unname() - ucl <- df[, 6] %>% unlist() %>% unname() + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) expect_equal(lcl, c(34.89, 20.05)) expect_equal(ucl, c(79.95, 65.11)) @@ -162,9 +167,9 @@ df <- mtcars %>% freq_table(digits = 3) test_that("The 'digits' parameter works as expected", { - percent <- df[, 4] %>% unlist() %>% unname() - lcl <- df[, 5] %>% unlist() %>% unname() - ucl <- df[, 6] %>% unlist() %>% unname() + percent <- pull(df, percent) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) expect_equal(percent, c(59.375, 40.625)) expect_equal(lcl, c(40.942, 24.502)) diff --git a/tests/testthat/test-freq_test.R b/tests/testthat/test-freq_test.R index 320132f..cb5844e 100644 --- a/tests/testthat/test-freq_test.R +++ b/tests/testthat/test-freq_test.R @@ -17,29 +17,34 @@ test_that("Dimensions of the object returned by freq_test are as expected", { columns <- ncol(df) expect_equal(rows, 2L) - expect_equal(columns, 11L) + expect_equal(columns, 12L) }) test_that("Class of freq_table_one_way is freq_table_one_way", { expect_is(df, "freq_table_one_way") }) -test_that("The correct var name is returned by freq_test", { +test_that("The correct var name is returned by freq_table", { name <- names(df)[1] - expect_match(name, "am") + expect_match(name, "var") +}) + +test_that("The correct cat var name is returned by freq_table", { + name <- names(df)[2] + expect_match(name, "cat") }) test_that("The correct variables levels are returned by freq_test", { - levels <- df[, 1] %>% unlist() %>% unname() + levels <- pull(df, cat) expect_equal(levels, c(0, 1)) }) test_that("The correct default statistics are returned by freq_test", { - n_expected <- df[, 7] %>% unlist() %>% unname() - chi2_contrib <- df[, 8] %>% unlist() %>% unname() - chi2_pearson <- df[, 9] %>% unlist() %>% unname() - deg_freedom <- df[, 10] %>% unlist() %>% unname() - p_chi2_pearson <- df[, 11] %>% unlist() %>% unname() %>% round(7) + n_expected <- pull(df, n_expected) + chi2_contrib <- pull(df, chi2_contrib) + chi2_pearson <- pull(df, chi2_pearson) + deg_freedom <- pull(df, df) + p_chi2_pearson <- pull(df, p_chi2_pearson) %>% round(7) expect_equal(n_expected, rep(16, 2)) expect_equal(chi2_contrib, rep(0.5625, 2)) @@ -64,7 +69,7 @@ test_that("Dimensions of the object returned by freq_table are as expected", { columns <- ncol(df) expect_equal(rows, 6L) - expect_equal(columns, 16L) + expect_equal(columns, 18L) }) test_that("Class of freq_table_two_way is freq_table_two_way", { @@ -72,30 +77,30 @@ test_that("Class of freq_table_two_way is freq_table_two_way", { }) test_that("The correct var names are returned by freq_table", { - group_1 <- names(df)[1] - group_2 <- names(df)[2] + row_var <- pull(df, row_var) + col_var <- pull(df, col_var) - expect_match(group_1, "am") - expect_match(group_2, "cyl") + expect_match(row_var, "am") + expect_match(col_var, "cyl") }) test_that("The correct variables levels are returned by freq_table", { - level_1 <- df[, 1] %>% unlist() %>% unname() - level_2 <- df[, 2] %>% unlist() %>% unname() + row_cat <- pull(df, row_cat) + col_cat <- pull(df, col_cat) - expect_equal(level_1, c(0, 0, 0, 1, 1, 1)) - expect_equal(level_2, c(4, 6, 8, 4, 6, 8)) + expect_equal(row_cat, c(0, 0, 0, 1, 1, 1)) + expect_equal(col_cat, c(4, 6, 8, 4, 6, 8)) }) test_that("The correct default statistics are returned by freq_table", { - n_col <- df[, 9] %>% unlist() %>% unname() - n_expected <- df[, 10] %>% unlist() %>% unname() - chi2_contrib <- df[, 11] %>% unlist() %>% unname() - chi2_pearson <- df[, 12] %>% unlist() %>% unname() - r_column <- df[, 13] %>% unlist() %>% unname() - c_column <- df[, 14] %>% unlist() %>% unname() - deg_freedom <- df[, 15] %>% unlist() %>% unname() - p_chi2_pearson <- df[, 16] %>% unlist() %>% unname() %>% round(7) + n_col <- pull(df, n_col) + n_expected <- pull(df, n_expected) + chi2_contrib <- pull(df, chi2_contrib) + chi2_pearson <- pull(df, chi2_pearson) + r_column <- pull(df, r) + c_column <- pull(df, c) + deg_freedom <- pull(df, df) + p_chi2_pearson <- pull(df, p_chi2_pearson) %>% round(7) expect_equal(n_col, c(11, 7, 14, 11, 7, 14)) expect_equal(n_expected, c(6.53125, 4.15625, 8.31250, 4.46875, 2.84375, 5.68750)) @@ -116,7 +121,7 @@ df <- mtcars %>% freq_test(method = "fisher") test_that("The expected p-value is returned from the fisher method", { - fisher_p_value <- df[, 17] %>% unlist() %>% unname() + fisher_p_value <- pull(df, p_fisher) expect_equal(fisher_p_value, rep(0.009104702, 6)) }) diff --git a/tests/testthat/test-mean_table.R b/tests/testthat/test-mean_table.R index 9c8db49..1682fbb 100644 --- a/tests/testthat/test-mean_table.R +++ b/tests/testthat/test-mean_table.R @@ -6,14 +6,14 @@ context("test-mean_table.R") # ============================================================================= -# Test one-way table of means +# Test one-way table of means with output = all # ============================================================================= df <- mtcars %>% - mean_table(mpg) + mean_table(mpg, output = all) test_that("Dimensions of the object returned by mean_table are as expected", { dims <- dim(df) - expect_equal(dims, c(1, 7)) + expect_equal(dims, c(1L, 10L)) }) test_that("Class of mean_table is mean_table", { @@ -21,20 +21,26 @@ test_that("Class of mean_table is mean_table", { }) test_that("The correct var name is returned by mean_table", { - name <- df[1, 1] %>% unlist() - expect_match(name, "mpg") + response_var <- pull(df, response_var) %>% unique() + expect_match(response_var, "mpg") }) test_that("The correct statistics are returned by mean_table", { - n <- df[1, 2] %>% as.integer() - mean <- df[1, 3] %>% as.numeric() - lcl <- df[1, 4] %>% as.numeric() - ucl <- df[1, 5] %>% as.numeric() - min <- df[1, 6] %>% as.numeric() - max <- df[1, 7] %>% as.numeric() - + n_miss <- pull(df, n_miss) + n <- pull(df, n) + mean <- pull(df, mean) + t_crit <- pull(df, t_crit) %>% round(6) + sem <- pull(df, sem) %>% round(6) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) + min <- pull(df, min) + max <- pull(df, max) + + expect_equal(n_miss, 0L) expect_equal(n, 32L) expect_equal(mean, 20.09) + expect_equal(t_crit, 2.039513) + expect_equal(sem, 1.065424) expect_equal(lcl, 17.92) expect_equal(ucl, 22.26) expect_equal(min, 10.4) @@ -45,15 +51,15 @@ test_that("The correct statistics are returned by mean_table", { # ============================================================================= -# Test grouped means table +# Test grouped means table with output = all # ============================================================================= df <- mtcars %>% group_by(cyl) %>% - mean_table(mpg) + mean_table(mpg, output = all) test_that("Dimensions of the object returned by mean_table are as expected", { dims <- dim(df) - expect_equal(dims, c(3, 8)) + expect_equal(dims, c(3L, 12L)) }) test_that("Class of mean_table is mean_table_grouped", { @@ -61,27 +67,88 @@ test_that("Class of mean_table is mean_table_grouped", { }) test_that("The correct var names are returned by mean_table", { - group_level <- df[1, 1] %>% as.numeric() - name <- df[1, 2] %>% unlist() + response_var <- pull(df, response_var) + group_var <- pull(df, group_var) + + expect_match(response_var, "mpg") + expect_match(group_var, "cyl") +}) - expect_equal(group_level, 4) - expect_match(name, "mpg") +test_that("The expected group categories are returned by mean_table", { + group_cat <- pull(df, group_cat) + expect_equal(group_cat, c(4L, 6L, 8L)) }) test_that("The correct statistics are returned by mean_table", { - n <- df[1, 3] %>% as.integer() - mean <- df[1, 4] %>% as.numeric() - lcl <- df[1, 5] %>% as.numeric() - ucl <- df[1, 6] %>% as.numeric() - min <- df[1, 7] %>% as.numeric() - max <- df[1, 8] %>% as.numeric() - - expect_equal(n, 11L) - expect_equal(mean, 26.66) - expect_equal(lcl, 23.63) - expect_equal(ucl, 29.69) - expect_equal(min, 21.4) - expect_equal(max, 33.9) + n_miss <- pull(df, n_miss) + n <- pull(df, n) + mean <- pull(df, mean) + t_crit <- pull(df, t_crit) %>% round(6) + sem <- pull(df, sem) %>% round(7) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) + min <- pull(df, min) + max <- pull(df, max) + + expect_equal(n_miss, c(0L, 0L, 0L)) + expect_equal(n, c(11L, 7L, 14L)) + expect_equal(mean, c(26.66, 19.74, 15.10)) + expect_equal(t_crit, c(2.228139, 2.446912, 2.160369)) + expect_equal(sem, c(1.3597642, 0.5493967, 0.6842016)) + expect_equal(lcl, c(23.63, 18.40, 13.62)) + expect_equal(ucl, c(29.69, 21.09, 16.58)) + expect_equal(min, c(21.4, 17.8, 10.4)) + expect_equal(max, c(33.9, 21.4, 19.2)) +}) + + + + +# ============================================================================= +# Test grouped means table with two group_by variables and output = all +# ============================================================================= +df <- mtcars %>% + group_by(cyl, am) %>% + mean_table(mpg, output = all) + +test_that("Dimensions of the object returned by mean_table are as expected", { + dims <- dim(df) + expect_equal(dims, c(6L, 14L)) +}) + +test_that("Class of mean_table is mean_table_grouped", { + expect_is(df, "mean_table_grouped") +}) + +test_that("The correct var names are returned by mean_table", { + response_var <- pull(df, response_var) + group_1 <- pull(df, group_1) + group_2 <- pull(df, group_2) + + expect_match(response_var, "mpg") + expect_match(group_1, "cyl") + expect_match(group_2, "am") +}) + +test_that("The expected group categories are returned by mean_table", { + group_1_cat <- pull(df, group_1_cat) + group_2_cat <- pull(df, group_2_cat) + + expect_equal(group_1_cat, c(4L, 4L, 6L, 6L, 8L, 8L)) + expect_equal(group_2_cat, c(0L, 1L, 0L, 1L, 0L, 1L)) +}) + +test_that("The correct statistics are returned by mean_table", { + # Just need to check a subgroup of all stats here. + n <- pull(df, n) + mean <- pull(df, mean) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) + + expect_equal(n, c(3L, 8L, 4L, 3L, 12L, 2L)) + expect_equal(mean, c(22.90, 28.07, 19.12, 20.57, 15.05, 15.40)) + expect_equal(lcl, c(19.29, 24.33, 16.53, 18.70, 13.29, 10.32)) + expect_equal(ucl, c(26.51, 31.82, 21.72, 22.43, 16.81, 20.48)) }) @@ -99,25 +166,21 @@ df <- mtcars %>% mean_table(mpg, t_prob = t) test_that("The 99% confidence intervals are correct", { - lcl <- df[1, 4] %>% as.numeric() - ucl <- df[1, 5] %>% as.numeric() + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) expect_equal(lcl, 17.17) expect_equal(ucl, 23.01) }) -# Output = "all" +# Output = "default" df <- mtcars %>% - mean_table(mpg, output = "all") - -test_that("Additional statistics from output = 'all' are as expected", { - n_miss <- df[1, 2] %>% as.numeric() - t_crit <- df[1, 5] %>% as.numeric() - sem <- df[1, 6] %>% as.numeric() + mean_table(mpg, output = default) - expect_equal(n_miss, 0) - expect_equal(t_crit, 2.039513, tolerance = .000001) - expect_equal(sem, 1.065424, tolerance = .000001) +test_that("Additional default list of statistics from output = default are as expected", { + vars <- names(df) + expect_equal(vars, c("response_var", "n", "mean", "sem", "lcl", "ucl", + "min", "max")) }) # digits = 3 @@ -125,9 +188,9 @@ df <- mtcars %>% mean_table(mpg, digits = 3) test_that("The 'digits' parameter works as expected", { - mean <- df[1, 3] %>% as.numeric() - lcl <- df[1, 4] %>% as.numeric() - ucl <- df[1, 5] %>% as.numeric() + mean <- pull(df, mean) + lcl <- pull(df, lcl) + ucl <- pull(df, ucl) expect_equal(mean, 20.091) expect_equal(lcl, 17.918) diff --git a/vignettes/presentation_dissemination.Rmd b/vignettes/presentation_dissemination.Rmd index 39de6c0..9b95a3a 100644 --- a/vignettes/presentation_dissemination.Rmd +++ b/vignettes/presentation_dissemination.Rmd @@ -235,7 +235,7 @@ mtcars %>% group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) + spread(key = row_cat, value = percent_row_95) ``` Where: @@ -252,9 +252,10 @@ mtcars %>% group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + # Rename to row bind with table shell + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) ``` There may be better ways to do this, but this is the best I've found so far. @@ -281,9 +282,9 @@ row <- mtcars %>% group_by(am, cyl) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = cyl, "am_0" = `0`, "am_1" = `1`) %>% + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% mutate(class = as.character(class)) # Need for bind_rows below bind_rows(table, row) @@ -303,10 +304,10 @@ for (i in seq_along(cat_vars)) { group_by(am, !!cat_vars[[i]]) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% - mutate(class = as.character(class)) # Need for bind_rows below + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% + mutate(class = as.character(class)) # Append to bottom of table table <- bind_rows(table, row) @@ -347,8 +348,9 @@ for (i in seq_along(cont_vars)) { group_by(am) %>% bfuncs::mean_table(!!cont_vars[[i]]) %>% bfuncs::format_table() %>% - spread(key = am, value = mean_95) %>% - rename("variable" = var, "am_0" = `0`, "am_1" = `1`) + spread(key = group_cat, value = mean_95) %>% + select(-group_var) %>% + rename("variable" = response_var, "am_0" = `0`, "am_1" = `1`) # Append to bottom of table table <- bind_rows(table, row) @@ -364,10 +366,10 @@ for (i in seq_along(cat_vars)) { group_by(am, !!cat_vars[[i]]) %>% freq_table() %>% format_table() %>% - spread(key = am, value = percent_row_95) %>% - mutate(variable = colnames(.)[1]) %>% - rename("class" = !!cat_vars[[i]], "am_0" = `0`, "am_1" = `1`) %>% - mutate(class = as.character(class)) # Need for bind_rows below + spread(key = row_cat, value = percent_row_95) %>% + select(-row_var) %>% + rename(variable = col_var, class = col_cat, "am_0" = `0`, "am_1" = `1`) %>% + mutate(class = as.character(class)) # Append to bottom of table table <- bind_rows(table, row) @@ -442,7 +444,7 @@ table <- table %>% * The best work around I can come up with is to add dashes, then find and replaces dashes with white space in Word. ```{r} -# table %>% +# table %>% # mutate( # class = stringr::str_replace(class, "^", "---"), # variable = if_else(variable == "", class, variable),