Add helper functions to refactor PKCT01 (#1048)

Closes #277 #543
insightsengineering · Sep 8, 2023 · 5dee25f · 5dee25f
1 parent b9c0944
commit 5dee25f
Show file tree

Hide file tree

Showing 35 changed files with 822 additions and 31 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -122,6 +122,7 @@ Collate:
     'h_step.R'
     'h_survival_biomarkers_subgroups.R'
     'h_survival_duration_subgroups.R'
+    'imputation_rule.R'
     'incidence_rate.R'
     'individual_patient_plot.R'
     'kaplan_meier_plot.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -120,6 +120,7 @@ export(format_extreme_values_ci)
 export(format_fraction)
 export(format_fraction_fixed_dp)
 export(format_fraction_threshold)
+export(format_sigfig)
 export(format_xx)
 export(g_forest)
 export(g_ipp)
@@ -211,6 +212,7 @@ export(has_counts_difference)
 export(has_fraction_in_any_col)
 export(has_fraction_in_cols)
 export(has_fractions_difference)
+export(imputation_rule)
 export(keep_content_rows)
 export(keep_rows)
 export(logistic_regression_cols)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,12 @@
 # tern 0.9.0.9001
+### New Features
+* Added `imputation_rule` function to apply imputation rule to data.
+* Added new format function `format_sigfig` to allow for numeric value formatting by a specified number of significant figures.
+
 ### Enhancements
 * Refactored `tabulate_rsp_subgroups` to pass sanitation checks by preventing creation of degenerate subtables.
+* Updated `analyze_vars_in_cols` to use caching, allow implementation of imputation rule via the `imp_rule` argument, and allow user to specify cell alignment via the `.aligns` argument.
+* Updated `add_rowcounts` to allow addition of row counts from `alt_counts_df` using the `alt_counts` argument.
 
 # tern 0.9.0
 ### New Features

diff --git a/R/analyze_variables.R b/R/analyze_variables.R
@@ -271,7 +271,7 @@ s_summary.factor <- function(x,
     }
   )
 
-  y$n_blq <- sum(grepl("BLQ|LTR|<[1-9]", x))
+  y$n_blq <- sum(grepl("BLQ|LTR|<[1-9]|<PCLLOQ", x))
 
   y
 }

diff --git a/R/analyze_vars_in_cols.R b/R/analyze_vars_in_cols.R
@@ -8,6 +8,17 @@
 #'
 #' @inheritParams argument_convention
 #' @inheritParams rtables::analyze_colvars
+#' @param imp_rule (`character`)\cr imputation rule setting. Defaults to `NULL` for no imputation rule. Can
+#'   also be `"1/3"` to implement 1/3 imputation rule or `"1/2"` to implement 1/2 imputation rule. In order
+#'   to use an imputation rule, the `avalcat_var` argument must be specified. See [imputation_rule()]
+#'   for more details on imputation.
+#' @param avalcat_var (`character`)\cr if `imp_rule` is not `NULL`, name of variable that indicates whether a
+#'   row in the data corresponds to an analysis value in category `"BLQ"`, `"LTR"`, `"<PCLLOQ"`, or none of
+#'   the above (defaults to `"AVALCAT1"`). Variable must be present in the data and should match the variable
+#'   used to calculate the `n_blq` statistic (if included in `.stats`).
+#' @param cache (`flag`)\cr whether to store computed values in a temporary caching environment. This will
+#'   speed up calculations in large tables, but should be set to `FALSE` if the same `rtable` layout is
+#'   used for multiple tables with different data. Defaults to `FALSE`.
 #' @param row_labels (`character`)\cr as this function works in columns space, usual `.labels`
 #'   character vector applies on the column space. You can change the row labels by defining this
 #'   parameter to a named character vector with names corresponding to the split values. It defaults
@@ -143,10 +154,14 @@ analyze_vars_in_cols <- function(lyt,
                                  row_labels = NULL,
                                  do_summarize_row_groups = FALSE,
                                  split_col_vars = TRUE,
+                                 imp_rule = NULL,
+                                 avalcat_var = "AVALCAT1",
+                                 cache = FALSE,
                                  .indent_mods = NULL,
                                  nested = TRUE,
                                  na_level = NULL,
-                                 .formats = NULL) {
+                                 .formats = NULL,
+                                 .aligns = NULL) {
   checkmate::assert_string(na_level, null.ok = TRUE)
   checkmate::assert_character(row_labels, null.ok = TRUE)
   checkmate::assert_int(.indent_mods, null.ok = TRUE)
@@ -201,17 +216,40 @@ analyze_vars_in_cols <- function(lyt,
     )
   }
 
+  env <- new.env() # create caching environment
+
   if (do_summarize_row_groups) {
     if (length(unique(vars)) > 1) {
       stop("When using do_summarize_row_groups only one label level var should be inserted.")
     }
 
     # Function list for do_summarize_row_groups. Slightly different handling of labels
     cfun_list <- Map(
-      function(stat) {
-        function(u, .spl_context, labelstr, ...) {
+      function(stat, use_cache, cache_env) {
+        function(u, .spl_context, labelstr, .df_row, ...) {
           # Statistic
-          res <- s_summary(u, ...)[[stat]]
+          var_row_val <- paste(
+            gsub("\\._\\[\\[[0-9]+\\]\\]_\\.", "", paste(tail(.spl_context$cur_col_split_val, 1)[[1]], collapse = "_")),
+            paste(.spl_context$value, collapse = "_"),
+            sep = "_"
+          )
+          if (use_cache) {
+            if (is.null(cache_env[[var_row_val]])) cache_env[[var_row_val]] <- s_summary(u, ...)
+            x_stats <- cache_env[[var_row_val]]
+          } else {
+            x_stats <- s_summary(u, ...)
+          }
+
+          if (is.null(imp_rule) || !stat %in% c("mean", "sd", "cv", "geom_mean", "geom_cv", "median", "min", "max")) {
+            res <- x_stats[[stat]]
+          } else {
+            res_imp <- imputation_rule(
+              .df_row, x_stats, stat,
+              imp_rule = imp_rule, post = as.numeric(tail(.spl_context$value, 1)) > 0, avalcat_var = avalcat_var
+            )
+            res <- res_imp[["val"]]
+            na_level <- res_imp[["na_level"]]
+          }
 
           # Label check and replacement
           if (length(row_labels) > 1) {
@@ -234,11 +272,14 @@ analyze_vars_in_cols <- function(lyt,
             label = lbl,
             format = formats_v[names(formats_v) == stat][[1]],
             format_na_str = na_level,
-            indent_mod = ifelse(is.null(.indent_mods), 0L, .indent_mods)
+            indent_mod = ifelse(is.null(.indent_mods), 0L, .indent_mods),
+            align = .aligns
           )
         }
       },
-      stat = .stats
+      stat = .stats,
+      use_cache = cache,
+      cache_env = replicate(length(.stats), env)
     )
 
     # Main call to rtables
@@ -251,10 +292,31 @@ analyze_vars_in_cols <- function(lyt,
   } else {
     # Function list for analyze_colvars
     afun_list <- Map(
-      function(stat) {
-        function(u, .spl_context, ...) {
+      function(stat, use_cache, cache_env) {
+        function(u, .spl_context, .df_row, ...) {
           # Main statistics
-          res <- s_summary(u, ...)[[stat]]
+          var_row_val <- paste(
+            gsub("\\._\\[\\[[0-9]+\\]\\]_\\.", "", paste(tail(.spl_context$cur_col_split_val, 1)[[1]], collapse = "_")),
+            paste(.spl_context$value, collapse = "_"),
+            sep = "_"
+          )
+          if (use_cache) {
+            if (is.null(cache_env[[var_row_val]])) cache_env[[var_row_val]] <- s_summary(u, ...)
+            x_stats <- cache_env[[var_row_val]]
+          } else {
+            x_stats <- s_summary(u, ...)
+          }
+
+          if (is.null(imp_rule) || !stat %in% c("mean", "sd", "cv", "geom_mean", "geom_cv", "median", "min", "max")) {
+            res <- x_stats[[stat]]
+          } else {
+            res_imp <- imputation_rule(
+              .df_row, x_stats, stat,
+              imp_rule = imp_rule, post = as.numeric(tail(.spl_context$value, 1)) > 0, avalcat_var = avalcat_var
+            )
+            res <- res_imp[["val"]]
+            na_level <- res_imp[["na_level"]]
+          }
 
           if (is.list(res)) {
             if (length(res) > 1) {
@@ -292,11 +354,14 @@ analyze_vars_in_cols <- function(lyt,
             label = lbl,
             format = formats_v[names(formats_v) == stat][[1]],
             format_na_str = na_level,
-            indent_mod = ifelse(is.null(.indent_mods), 0L, .indent_mods)
+            indent_mod = ifelse(is.null(.indent_mods), 0L, .indent_mods),
+            align = .aligns
           )
         }
       },
-      stat = .stats
+      stat = .stats,
+      use_cache = cache,
+      cache_env = replicate(length(.stats), env)
     )
 
     # Main call to rtables

diff --git a/R/argument_convention.R b/R/argument_convention.R
@@ -4,6 +4,8 @@
 #' that are used repeatedly to express an analysis.
 #'
 #' @param ... additional arguments for the lower level functions.
+#' @param .aligns (`character`)\cr alignment for table contents (not including labels). When `NULL`, `"center"`
+#'   is applied. See [formatters::list_valid_aligns()] for a list of all currently supported alignments.
 #' @param .all_col_counts (`vector` of `integer`)\cr each value represents a global count for a column. Values are
 #'   taken from `alt_counts_df` if specified (see [rtables::build_table()]).
 #' @param .df_row (`data.frame`)\cr data frame across all of the columns for the given row split.

diff --git a/R/formatting_functions.R b/R/formatting_functions.R
@@ -209,6 +209,33 @@ format_xx <- function(str) {
   return(rtable_format)
 }
 
+#' Formatting Numeric Values By Significant Figures
+#'
+#' Format numeric values to print with a specified number of significant figures.
+#'
+#' @param sigfig (`integer`)\cr number of significant figures to display.
+#'
+#' @return An `rtables` formatting function.
+#'
+#' @examples
+#' fmt_3sf <- format_sigfig(3)
+#' fmt_3sf(1.658)
+#' fmt_3sf(1e1)
+#'
+#' fmt_5sf <- format_sigfig(5)
+#' fmt_5sf(0.57)
+#' fmt_5sf(0.000025645)
+#'
+#' @family formatting functions
+#' @export
+format_sigfig <- function(sigfig) {
+  checkmate::assert_integerish(sigfig)
+  function(x, ...) {
+    if (!is.numeric(x)) stop("`format_sigfig` cannot be used for non-numeric values. Please choose another format.")
+    formatC(signif(x, digits = sigfig), digits = sigfig, format = "fg", flag = "#")
+  }
+}
+
 #' Formatting Fraction with Lower Threshold
 #'
 #' @description `r lifecycle::badge("stable")`

diff --git a/R/imputation_rule.R b/R/imputation_rule.R
@@ -0,0 +1,60 @@
+#' Apply 1/3 or 1/2 Imputation Rule to Data
+#'
+#' @description `r lifecycle::badge("stable")`
+#'
+#' @inheritParams argument_convention
+#' @param x_stats (`named list`)\cr a named list of statistics, typically the results of [s_summary()].
+#' @param stat (`character`)\cr statistic to return the value/NA level of according to the imputation
+#'   rule applied.
+#' @param imp_rule (`character`)\cr imputation rule setting. Set to `"1/3"` to implement 1/3 imputation
+#'   rule or `"1/2"` to implement 1/2 imputation rule.
+#' @param post (`flag`)\cr whether the data corresponds to a post-dose time-point (defaults to `FALSE`).
+#'   This parameter is only used when `imp_rule` is set to `"1/3"`.
+#' @param avalcat_var (`character`)\cr name of variable that indicates whether a row in `df` corresponds
+#'   to an analysis value in category `"BLQ"`, `"LTR"`, `"<PCLLOQ"`, or none of the above
+#'   (defaults to `"AVALCAT1"`). Variable `avalcat_var` must be present in `df`.
+#'
+#' @return A `list` containing statistic value (`val`) and NA level (`na_level`) that should be displayed
+#'   according to the specified imputation rule.
+#'
+#' @seealso [analyze_vars_in_cols()] where this function can be implemented by setting the `imp_rule`
+#'   argument.
+#'
+#' @examples
+#' set.seed(1)
+#' df <- data.frame(
+#'   AVAL = runif(50, 0, 1),
+#'   AVALCAT1 = sample(c(1, "BLQ"), 50, replace = TRUE)
+#' )
+#' x_stats <- s_summary(df$AVAL)
+#' imputation_rule(df, x_stats, "max", "1/3")
+#' imputation_rule(df, x_stats, "geom_mean", "1/3")
+#' imputation_rule(df, x_stats, "mean", "1/2")
+#'
+#' @export
+imputation_rule <- function(df, x_stats, stat, imp_rule, post = FALSE, avalcat_var = "AVALCAT1") {
+  checkmate::assert_choice(avalcat_var, names(df))
+  checkmate::assert_choice(imp_rule, c("1/3", "1/2"))
+  n_blq <- sum(grepl("BLQ|LTR|<[1-9]|<PCLLOQ", df[[avalcat_var]]))
+  ltr_blq_ratio <- n_blq / max(1, nrow(df))
+
+  # defaults
+  val <- x_stats[[stat]]
+  na_level <- "NE"
+
+  if (imp_rule == "1/3") {
+    if (!post && stat == "geom_mean") val <- NA # 1/3_pre_LT, 1/3_pre_GT
+    if (ltr_blq_ratio > 1 / 3) {
+      if (stat != "geom_mean") na_level <- "ND" # 1/3_pre_GT, 1/3_post_GT
+      if (!post && !stat %in% c("median", "max")) val <- NA # 1/3_pre_GT
+      if (post && !stat %in% c("median", "max", "geom_mean")) val <- NA # 1/3_post_GT
+    }
+  } else if (imp_rule == "1/2") {
+    if (ltr_blq_ratio > 1 / 2 && !stat == "max") {
+      val <- NA # 1/2_GT
+      na_level <- "ND" # 1/2_GT
+    }
+  }
+
+  list(val = val, na_level = na_level)
+}
diff --git a/R/utils_rtables.R b/R/utils_rtables.R
@@ -89,21 +89,54 @@ cfun_by_flag <- function(analysis_var,
 
 #' Content Row Function to Add Row Total to Labels
 #'
-#' This takes the label of the latest row split level and adds the row total in parentheses.
+#' This takes the label of the latest row split level and adds the row total from `df` in parentheses.
+#' This function differs from [c_label_n_alt()] by taking row counts from `df` rather than
+#' `alt_counts_df`, and is used by [add_rowcounts()] when `alt_counts` is set to `FALSE`.
 #'
 #' @inheritParams argument_convention
 #'
-#' @return A `list` containing "row_count" with the row count value and the correct label.
+#' @return A list with formatted [rtables::CellValue()] with the row count value and the correct label.
 #'
 #' @note It is important here to not use `df` but rather `.N_row` in the implementation, because
 #'   the former is already split by columns and will refer to the first column of the data only.
 #'
+#' @seealso [c_label_n_alt()] which performs the same function but retrieves row counts from
+#'   `alt_counts_df` instead of `df`.
+#'
 #' @keywords internal
 c_label_n <- function(df,
                       labelstr,
                       .N_row) { # nolint
   label <- paste0(labelstr, " (N=", .N_row, ")")
-  list(row_count = formatters::with_label(c(.N_row, .N_row), label))
+  in_rows(
+    .list = list(row_count = formatters::with_label(c(.N_row, .N_row), label)),
+    .formats = c(row_count = function(x, ...) "")
+  )
+}
+
+#' Content Row Function to Add `alt_counts_df` Row Total to Labels
+#'
+#' This takes the label of the latest row split level and adds the row total from `alt_counts_df`
+#' in parentheses. This function differs from [c_label_n()] by taking row counts from `alt_counts_df`
+#' rather than `df`, and is used by [add_rowcounts()] when `alt_counts` is set to `TRUE`.
+#'
+#' @inheritParams argument_convention
+#'
+#' @return A list with formatted [rtables::CellValue()] with the row count value and the correct label.
+#'
+#' @seealso [c_label_n()] which performs the same function but retrieves row counts from `df` instead
+#'   of `alt_counts_df`.
+#'
+#' @keywords internal
+c_label_n_alt <- function(df,
+                          labelstr,
+                          .alt_df_row) {
+  N_row_alt <- nrow(.alt_df_row) # nolint
+  label <- paste0(labelstr, " (N=", N_row_alt, ")")
+  in_rows(
+    .list = list(row_count = formatters::with_label(c(N_row_alt, N_row_alt), label)),
+    .formats = c(row_count = function(x, ...) "")
+  )
 }
 
 #' Layout Creating Function to Add Row Total Counts
@@ -114,6 +147,8 @@ c_label_n <- function(df,
 #'  is a wrapper for [rtables::summarize_row_groups()].
 #'
 #' @inheritParams argument_convention
+#' @param alt_counts (`flag`)\cr whether row counts should be taken from `alt_counts_df` (`TRUE`)
+#'   or from `df` (`FALSE`). Defaults to `FALSE`.
 #'
 #' @return A modified layout where the latest row split labels now have the row-wise
 #'   total counts (i.e. without column-based subsetting) attached in parentheses.
@@ -131,15 +166,10 @@ c_label_n <- function(df,
 #'   build_table(DM)
 #'
 #' @export
-add_rowcounts <- function(lyt) {
-  c_lbl_n_fun <- make_afun(
-    c_label_n,
-    .stats = c("row_count"),
-    .formats = c(row_count = function(x, ...) "")
-  )
+add_rowcounts <- function(lyt, alt_counts = FALSE) {
   summarize_row_groups(
     lyt,
-    cfun = c_lbl_n_fun
+    cfun = if (alt_counts) c_label_n_alt else c_label_n
   )
 }
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -68,6 +68,7 @@ reference:
     desc: These functions are useful in defining an analysis.
     contents:
       - starts_with("h_")
+      - imputation_rule
       - starts_with("or_")
       - starts_with("prop_")
       - summary_stats