Merge pull request #361 from n-kall/export-helpers

export helper functions
stan-dev · May 15, 2024 · 86c8fba · 86c8fba
2 parents 419a1a3 + ee9d36f
commit 86c8fba
Show file tree

Hide file tree

Showing 8 changed files with 210 additions and 57 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -474,6 +474,10 @@ export(pareto_khat)
 export(pareto_khat_threshold)
 export(pareto_min_ss)
 export(pareto_smooth)
+export(ps_convergence_rate)
+export(ps_khat_threshold)
+export(ps_min_ss)
+export(ps_tail_length)
 export(quantile2)
 export(r_scale)
 export(rdo)

diff --git a/R/gpd.R b/R/gpd.R
@@ -21,22 +21,28 @@ qgeneralized_pareto <- function(p, mu = 0, sigma = 1, k = 0, lower.tail = TRUE,
 
 #' Estimate parameters of the Generalized Pareto distribution
 #'
-#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and \eqn{\sigma} of
-#' the generalized Pareto distribution (GPD), assuming the location parameter is
-#' 0. By default the fit uses a prior for \eqn{k} (this is in addition to the prior described by Zhang and Stephens, 2009), which will stabilize
-#' estimates for very small sample sizes (and low effective sample sizes in the
-#' case of MCMC samples). The weakly informative prior is a Gaussian prior
-#' centered at 0.5 (see details in Vehtari et al., 2024).
-#'
-#' @param x A numeric vector. The sample from which to estimate the parameters.
-#' @param wip Logical indicating whether to adjust \eqn{k} based on a weakly
-#'   informative Gaussian prior centered on 0.5. Defaults to `TRUE`.
-#' @param min_grid_pts The minimum number of grid points used in the fitting
-#'   algorithm. The actual number used is `min_grid_pts + floor(sqrt(length(x)))`.
-#' @param sort_x If `TRUE` (the default), the first step in the fitting
-#'   algorithm is to sort the elements of `x`. If `x` is already
-#'   sorted in ascending order then `sort_x` can be set to `FALSE` to
-#'   skip the initial sorting step.
+#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and
+#' \eqn{\sigma} of the generalized Pareto distribution (GPD), assuming
+#' the location parameter is 0. By default the fit uses a prior for
+#' \eqn{k} (this is in addition to the prior described by Zhang and
+#' Stephens, 2009), which will stabilize estimates for very small
+#' sample sizes (and low effective sample sizes in the case of MCMC
+#' samples). The weakly informative prior is a Gaussian prior centered
+#' at 0.5 (see details in Vehtari et al., 2024). This is used
+#' internally but is exported for use by other packages.
+#' @family helper-functions
+#' @param x A numeric vector. The sample from which to estimate the
+#'   parameters.
+#' @param wip Logical indicating whether to adjust \eqn{k} based on a
+#'   weakly informative Gaussian prior centered on 0.5. Defaults to
+#'   `TRUE`.
+#' @param min_grid_pts The minimum number of grid points used in the
+#'   fitting algorithm. The actual number used is `min_grid_pts +
+#'   floor(sqrt(length(x)))`.
+#' @param sort_x If `TRUE` (the default), the first step in the
+#'   fitting algorithm is to sort the elements of `x`. If `x` is
+#'   already sorted in ascending order then `sort_x` can be set to
+#'   `FALSE` to skip the initial sorting step.
 #' @return A named list with components `k` and `sigma`.
 #'
 #' @details Here the parameter \eqn{k} is the negative of \eqn{k} in Zhang &
@@ -67,7 +73,8 @@ gpdfit <- function(x, wip = TRUE, min_grid_pts = 30, sort_x = TRUE) {
   sigma_hat <- -k_hat / theta_hat
 
   # adjust k_hat based on weakly informative prior, Gaussian centered on 0.5.
-  # this stabilizes estimates for very small Monte Carlo sample sizes and low neff (see Vehtari et al., 2024 for details)
+  # this stabilizes estimates for very small Monte Carlo sample sizes and low ess
+  # (see Vehtari et al., 2024 for details)
   if (wip) {
     k_hat <- (k_hat * N + 0.5 * 10) / (N + 10)
   }

diff --git a/R/pareto_smooth.R b/R/pareto_smooth.R
@@ -281,25 +281,25 @@ pareto_smooth.default <- function(x,
   }
 
   tail <- match.arg(tail)
-  S <- length(x)
+  ndraws <- length(x)
 
   # automatically calculate relative efficiency
   if (is.null(r_eff)) {
-    r_eff <- ess_tail(x) / S
+    r_eff <- ess_tail(x) / ndraws
   }
 
   # automatically calculate tail length
   if (is.null(ndraws_tail)) {
-    ndraws_tail <- ps_tail_length(S, r_eff)
+    ndraws_tail <- ps_tail_length(ndraws, r_eff)
   }
 
   if (tail == "both") {
 
-    if (ndraws_tail > S / 2) {
+    if (ndraws_tail > ndraws / 2) {
       warning("Number of tail draws cannot be more than half ",
               "the total number of draws if both tails are fit, ",
-              "changing to ", S / 2, ".")
-      ndraws_tail <- S / 2
+              "changing to ", ndraws / 2, ".")
+      ndraws_tail <- ndraws / 2
     }
 
     if (ndraws_tail < 5) {
@@ -347,7 +347,7 @@ pareto_smooth.default <- function(x,
   diags_list <- list(khat = k)
 
   if (extra_diags) {
-    ext_diags <- .pareto_smooth_extra_diags(k, S)
+    ext_diags <- .pareto_smooth_extra_diags(k, ndraws)
     diags_list <- c(diags_list, ext_diags)
   }
 
@@ -447,8 +447,8 @@ pareto_convergence_rate.rvar <- function(x, ...) {
 
   tail <- match.arg(tail)
 
-  S <- length(x)
-  tail_ids <- seq(S - ndraws_tail + 1, S)
+  ndraws <- length(x)
+  tail_ids <- seq(ndraws - ndraws_tail + 1, ndraws)
 
   if (tail == "left") {
     x <- -x
@@ -527,15 +527,15 @@ pareto_convergence_rate.rvar <- function(x, ...) {
 #' Extra pareto-k diagnostics
 #'
 #' internal function to calculate the extra diagnostics for a given
-#' pareto k and sample size S
+#' pareto k and number of draws ndraws
 #' @noRd
-.pareto_smooth_extra_diags <- function(k, S, ...) {
+.pareto_smooth_extra_diags <- function(k, ndraws, ...) {
 
   min_ss <- ps_min_ss(k)
 
-  khat_threshold <- ps_khat_threshold(S)
+  khat_threshold <- ps_khat_threshold(ndraws)
 
-  convergence_rate <- ps_convergence_rate(k, S)
+  convergence_rate <- ps_convergence_rate(k, ndraws)
 
   other_diags <- list(
     min_ss = min_ss,
@@ -547,12 +547,15 @@ pareto_convergence_rate.rvar <- function(x, ...) {
 #' Pareto-smoothing minimum sample-size
 #'
 #' Given Pareto-k computes the minimum sample size for reliable Pareto
-#' smoothed estimate (to have small probability of large error)
-#' Equation (11) in PSIS paper
+#' smoothed estimate (to have small probability of large error). See
+#' section 3.2.3 in Vehtari et al. (2024). This function is exported
+#' to be usable by other packages. For user-facing diagnostic functions, see
+#' [`pareto_min_ss`] and [`pareto_diags`].
+#' @family helper-functions
 #' @param k pareto k value
 #' @param ... unused
 #' @return minimum sample size
-#' @noRd
+#' @export
 ps_min_ss <- function(k, ...) {
   if (k < 1) {
     out <- 10^(1 / (1 - max(0, k)))
@@ -562,58 +565,74 @@ ps_min_ss <- function(k, ...) {
   out
 }
 
-#' Pareto-smoothing k-hat threshold
+#' Pareto k-hat threshold
 #'
-#' Given sample size S computes khat threshold for reliable Pareto
+#' Given number of draws, computes khat threshold for reliable Pareto
 #' smoothed estimate (to have small probability of large error). See
-#' section 3.2.4, equation (13).
-#' @param S sample size
+#' section 3.2.4, equation (13) of Vehtari et al. (2024). This
+#' function is exported to be usable by other packages. For
+#' user-facing diagnostic functions, see [`pareto_khat_threshold`] and
+#' [`pareto_diags`].
+#' @family helper-functions
+#' @param ndraws number of draws
 #' @param ... unused
 #' @return threshold
-#' @noRd
-ps_khat_threshold <- function(S, ...) {
-  1 - 1 / log10(S)
+#' @export
+ps_khat_threshold <- function(ndraws, ...) {
+  1 - 1 / log10(ndraws)
 }
 
-#' Pareto-smoothing convergence rate
+#' Pareto convergence rate
 #'
-#' Given S and scalar or array of k's, compute the relative
-#' convergence rate of PSIS estimate RMSE
+#' Given number of draws and scalar or array of k's, compute the
+#' relative convergence rate of PSIS estimate RMSE. See Appendix B of
+#' Vehtari et al. (2024). This function is exported to be usable by
+#' other packages. For user-facing diagnostic functions, see
+#' [`pareto_convergence_rate`] and [`pareto_diags`].
+#' @family helper-functions
 #' @param k pareto-k values
-#' @param S sample size
+#' @param ndraws number of draws
 #' @param ... unused
 #' @return convergence rate
-#' @noRd
-ps_convergence_rate <- function(k, S, ...) {
+#' @export
+ps_convergence_rate <- function(k, ndraws, ...) {
   # allow array of k's
   rate <- numeric(length(k))
   # k<0 bounded distribution
   rate[k < 0] <- 1
   # k>0 non-finite mean
   rate[k > 1] <- 0
   # limit value at k=1/2
-  rate[k == 0.5] <- 1 - 1 / log(S)
-  # smooth approximation for the rest (see Appendix of PSIS paper)
+  rate[k == 0.5] <- 1 - 1 / log(ndraws)
+  # smooth approximation for the rest (see Appendix B of PSIS paper)
   ki <- (k > 0 & k < 1 & k != 0.5)
   kk <- k[ki]
   rate[ki] <- pmax(
     0,
-    (2 * (kk - 1) * S^(2 * kk + 1) + (1 - 2 * kk) * S^(2 * kk) + S^2) /
-      ((S - 1) * (S - S^(2 * kk)))
+    (2 * (kk - 1) * ndraws^(2 * kk + 1) + (1 - 2 * kk) * ndraws^(2 * kk) + ndraws^2) /
+      ((ndraws - 1) * (ndraws - ndraws^(2 * kk)))
   )
   rate
 }
 
-#' Calculate the tail length from S and r_eff
-#' Appendix H in PSIS paper
-#' @noRd
-ps_tail_length <- function(S, r_eff, ...) {
-  ifelse(S > 225, ceiling(3 * sqrt(S / r_eff)), S / 5)
+#' Pareto tail length
+#' 
+#' Calculate the tail length from number of draws and relative efficiency
+#' r_eff. See Appendix H in Vehtari et al. (2024). This function is
+#' used internally and is exported to be available for other packages.
+#' @family helper-functions
+#' @param ndraws number of draws
+#' @param r_eff relative efficiency
+#' @param ... unused
+#' @return tail length
+#' @export
+ps_tail_length <- function(ndraws, r_eff, ...) {
+  ifelse(ndraws > 225, ceiling(3 * sqrt(ndraws / r_eff)), ndraws / 5)
 }
 
 #' Pareto-k diagnostic message
 #'
-#' Given S and scalar and k, form a diagnostic message string
+#' Given number of draws and k, form a diagnostic message string.
 #' @param diags (numeric) named vector of diagnostic values
 #' @param are_weights (logical) are the diagnostics for weights
 #' @param ... unused

diff --git a/man-roxygen/ref-vehtari-paretosmooth-2022.R b/man-roxygen/ref-vehtari-paretosmooth-2022.R
@@ -2,4 +2,4 @@
 #' Aki Vehtari, Daniel Simpson, Andrew Gelman, Yuling Yao and
 #' Jonah Gabry (2024). Pareto Smoothed Importance Sampling.
 #' *Journal of Machine Learning Research*, 25(72):1-58.
-#' [PDF](https://jmlr.org/papers/v25/19-556.html)
+#' [PDF](https://jmlr.org/papers/v25/19-556.html)
diff --git a/man/ps_convergence_rate.Rd b/man/ps_convergence_rate.Rd
diff --git a/man/ps_khat_threshold.Rd b/man/ps_khat_threshold.Rd
diff --git a/man/ps_min_ss.Rd b/man/ps_min_ss.Rd
diff --git a/man/ps_tail_length.Rd b/man/ps_tail_length.Rd