diff --git a/NAMESPACE b/NAMESPACE index 62cf616..a3f3465 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -474,6 +474,10 @@ export(pareto_khat) export(pareto_khat_threshold) export(pareto_min_ss) export(pareto_smooth) +export(ps_convergence_rate) +export(ps_khat_threshold) +export(ps_min_ss) +export(ps_tail_length) export(quantile2) export(r_scale) export(rdo) diff --git a/R/gpd.R b/R/gpd.R index d86eef5..3cffd77 100644 --- a/R/gpd.R +++ b/R/gpd.R @@ -21,22 +21,28 @@ qgeneralized_pareto <- function(p, mu = 0, sigma = 1, k = 0, lower.tail = TRUE, #' Estimate parameters of the Generalized Pareto distribution #' -#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and \eqn{\sigma} of -#' the generalized Pareto distribution (GPD), assuming the location parameter is -#' 0. By default the fit uses a prior for \eqn{k} (this is in addition to the prior described by Zhang and Stephens, 2009), which will stabilize -#' estimates for very small sample sizes (and low effective sample sizes in the -#' case of MCMC samples). The weakly informative prior is a Gaussian prior -#' centered at 0.5 (see details in Vehtari et al., 2024). -#' -#' @param x A numeric vector. The sample from which to estimate the parameters. -#' @param wip Logical indicating whether to adjust \eqn{k} based on a weakly -#' informative Gaussian prior centered on 0.5. Defaults to `TRUE`. -#' @param min_grid_pts The minimum number of grid points used in the fitting -#' algorithm. The actual number used is `min_grid_pts + floor(sqrt(length(x)))`. -#' @param sort_x If `TRUE` (the default), the first step in the fitting -#' algorithm is to sort the elements of `x`. If `x` is already -#' sorted in ascending order then `sort_x` can be set to `FALSE` to -#' skip the initial sorting step. +#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and +#' \eqn{\sigma} of the generalized Pareto distribution (GPD), assuming +#' the location parameter is 0. By default the fit uses a prior for +#' \eqn{k} (this is in addition to the prior described by Zhang and +#' Stephens, 2009), which will stabilize estimates for very small +#' sample sizes (and low effective sample sizes in the case of MCMC +#' samples). The weakly informative prior is a Gaussian prior centered +#' at 0.5 (see details in Vehtari et al., 2024). This is used +#' internally but is exported for use by other packages. +#' @family helper-functions +#' @param x A numeric vector. The sample from which to estimate the +#' parameters. +#' @param wip Logical indicating whether to adjust \eqn{k} based on a +#' weakly informative Gaussian prior centered on 0.5. Defaults to +#' `TRUE`. +#' @param min_grid_pts The minimum number of grid points used in the +#' fitting algorithm. The actual number used is `min_grid_pts + +#' floor(sqrt(length(x)))`. +#' @param sort_x If `TRUE` (the default), the first step in the +#' fitting algorithm is to sort the elements of `x`. If `x` is +#' already sorted in ascending order then `sort_x` can be set to +#' `FALSE` to skip the initial sorting step. #' @return A named list with components `k` and `sigma`. #' #' @details Here the parameter \eqn{k} is the negative of \eqn{k} in Zhang & @@ -67,7 +73,8 @@ gpdfit <- function(x, wip = TRUE, min_grid_pts = 30, sort_x = TRUE) { sigma_hat <- -k_hat / theta_hat # adjust k_hat based on weakly informative prior, Gaussian centered on 0.5. - # this stabilizes estimates for very small Monte Carlo sample sizes and low neff (see Vehtari et al., 2024 for details) + # this stabilizes estimates for very small Monte Carlo sample sizes and low ess + # (see Vehtari et al., 2024 for details) if (wip) { k_hat <- (k_hat * N + 0.5 * 10) / (N + 10) } diff --git a/R/pareto_smooth.R b/R/pareto_smooth.R index 39acaac..dca53cc 100644 --- a/R/pareto_smooth.R +++ b/R/pareto_smooth.R @@ -281,25 +281,25 @@ pareto_smooth.default <- function(x, } tail <- match.arg(tail) - S <- length(x) + ndraws <- length(x) # automatically calculate relative efficiency if (is.null(r_eff)) { - r_eff <- ess_tail(x) / S + r_eff <- ess_tail(x) / ndraws } # automatically calculate tail length if (is.null(ndraws_tail)) { - ndraws_tail <- ps_tail_length(S, r_eff) + ndraws_tail <- ps_tail_length(ndraws, r_eff) } if (tail == "both") { - if (ndraws_tail > S / 2) { + if (ndraws_tail > ndraws / 2) { warning("Number of tail draws cannot be more than half ", "the total number of draws if both tails are fit, ", - "changing to ", S / 2, ".") - ndraws_tail <- S / 2 + "changing to ", ndraws / 2, ".") + ndraws_tail <- ndraws / 2 } if (ndraws_tail < 5) { @@ -347,7 +347,7 @@ pareto_smooth.default <- function(x, diags_list <- list(khat = k) if (extra_diags) { - ext_diags <- .pareto_smooth_extra_diags(k, S) + ext_diags <- .pareto_smooth_extra_diags(k, ndraws) diags_list <- c(diags_list, ext_diags) } @@ -447,8 +447,8 @@ pareto_convergence_rate.rvar <- function(x, ...) { tail <- match.arg(tail) - S <- length(x) - tail_ids <- seq(S - ndraws_tail + 1, S) + ndraws <- length(x) + tail_ids <- seq(ndraws - ndraws_tail + 1, ndraws) if (tail == "left") { x <- -x @@ -527,15 +527,15 @@ pareto_convergence_rate.rvar <- function(x, ...) { #' Extra pareto-k diagnostics #' #' internal function to calculate the extra diagnostics for a given -#' pareto k and sample size S +#' pareto k and number of draws ndraws #' @noRd -.pareto_smooth_extra_diags <- function(k, S, ...) { +.pareto_smooth_extra_diags <- function(k, ndraws, ...) { min_ss <- ps_min_ss(k) - khat_threshold <- ps_khat_threshold(S) + khat_threshold <- ps_khat_threshold(ndraws) - convergence_rate <- ps_convergence_rate(k, S) + convergence_rate <- ps_convergence_rate(k, ndraws) other_diags <- list( min_ss = min_ss, @@ -547,12 +547,15 @@ pareto_convergence_rate.rvar <- function(x, ...) { #' Pareto-smoothing minimum sample-size #' #' Given Pareto-k computes the minimum sample size for reliable Pareto -#' smoothed estimate (to have small probability of large error) -#' Equation (11) in PSIS paper +#' smoothed estimate (to have small probability of large error). See +#' section 3.2.3 in Vehtari et al. (2024). This function is exported +#' to be usable by other packages. For user-facing diagnostic functions, see +#' [`pareto_min_ss`] and [`pareto_diags`]. +#' @family helper-functions #' @param k pareto k value #' @param ... unused #' @return minimum sample size -#' @noRd +#' @export ps_min_ss <- function(k, ...) { if (k < 1) { out <- 10^(1 / (1 - max(0, k))) @@ -562,29 +565,37 @@ ps_min_ss <- function(k, ...) { out } -#' Pareto-smoothing k-hat threshold +#' Pareto k-hat threshold #' -#' Given sample size S computes khat threshold for reliable Pareto +#' Given number of draws, computes khat threshold for reliable Pareto #' smoothed estimate (to have small probability of large error). See -#' section 3.2.4, equation (13). -#' @param S sample size +#' section 3.2.4, equation (13) of Vehtari et al. (2024). This +#' function is exported to be usable by other packages. For +#' user-facing diagnostic functions, see [`pareto_khat_threshold`] and +#' [`pareto_diags`]. +#' @family helper-functions +#' @param ndraws number of draws #' @param ... unused #' @return threshold -#' @noRd -ps_khat_threshold <- function(S, ...) { - 1 - 1 / log10(S) +#' @export +ps_khat_threshold <- function(ndraws, ...) { + 1 - 1 / log10(ndraws) } -#' Pareto-smoothing convergence rate +#' Pareto convergence rate #' -#' Given S and scalar or array of k's, compute the relative -#' convergence rate of PSIS estimate RMSE +#' Given number of draws and scalar or array of k's, compute the +#' relative convergence rate of PSIS estimate RMSE. See Appendix B of +#' Vehtari et al. (2024). This function is exported to be usable by +#' other packages. For user-facing diagnostic functions, see +#' [`pareto_convergence_rate`] and [`pareto_diags`]. +#' @family helper-functions #' @param k pareto-k values -#' @param S sample size +#' @param ndraws number of draws #' @param ... unused #' @return convergence rate -#' @noRd -ps_convergence_rate <- function(k, S, ...) { +#' @export +ps_convergence_rate <- function(k, ndraws, ...) { # allow array of k's rate <- numeric(length(k)) # k<0 bounded distribution @@ -592,28 +603,36 @@ ps_convergence_rate <- function(k, S, ...) { # k>0 non-finite mean rate[k > 1] <- 0 # limit value at k=1/2 - rate[k == 0.5] <- 1 - 1 / log(S) - # smooth approximation for the rest (see Appendix of PSIS paper) + rate[k == 0.5] <- 1 - 1 / log(ndraws) + # smooth approximation for the rest (see Appendix B of PSIS paper) ki <- (k > 0 & k < 1 & k != 0.5) kk <- k[ki] rate[ki] <- pmax( 0, - (2 * (kk - 1) * S^(2 * kk + 1) + (1 - 2 * kk) * S^(2 * kk) + S^2) / - ((S - 1) * (S - S^(2 * kk))) + (2 * (kk - 1) * ndraws^(2 * kk + 1) + (1 - 2 * kk) * ndraws^(2 * kk) + ndraws^2) / + ((ndraws - 1) * (ndraws - ndraws^(2 * kk))) ) rate } -#' Calculate the tail length from S and r_eff -#' Appendix H in PSIS paper -#' @noRd -ps_tail_length <- function(S, r_eff, ...) { - ifelse(S > 225, ceiling(3 * sqrt(S / r_eff)), S / 5) +#' Pareto tail length +#' +#' Calculate the tail length from number of draws and relative efficiency +#' r_eff. See Appendix H in Vehtari et al. (2024). This function is +#' used internally and is exported to be available for other packages. +#' @family helper-functions +#' @param ndraws number of draws +#' @param r_eff relative efficiency +#' @param ... unused +#' @return tail length +#' @export +ps_tail_length <- function(ndraws, r_eff, ...) { + ifelse(ndraws > 225, ceiling(3 * sqrt(ndraws / r_eff)), ndraws / 5) } #' Pareto-k diagnostic message #' -#' Given S and scalar and k, form a diagnostic message string +#' Given number of draws and k, form a diagnostic message string. #' @param diags (numeric) named vector of diagnostic values #' @param are_weights (logical) are the diagnostics for weights #' @param ... unused diff --git a/man-roxygen/ref-vehtari-paretosmooth-2022.R b/man-roxygen/ref-vehtari-paretosmooth-2022.R index addcf4c..6f87d07 100644 --- a/man-roxygen/ref-vehtari-paretosmooth-2022.R +++ b/man-roxygen/ref-vehtari-paretosmooth-2022.R @@ -2,4 +2,4 @@ #' Aki Vehtari, Daniel Simpson, Andrew Gelman, Yuling Yao and #' Jonah Gabry (2024). Pareto Smoothed Importance Sampling. #' *Journal of Machine Learning Research*, 25(72):1-58. -#' [PDF](https://jmlr.org/papers/v25/19-556.html) +#' [PDF](https://jmlr.org/papers/v25/19-556.html) \ No newline at end of file diff --git a/man/ps_convergence_rate.Rd b/man/ps_convergence_rate.Rd new file mode 100644 index 0000000..535f0eb --- /dev/null +++ b/man/ps_convergence_rate.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pareto_smooth.R +\name{ps_convergence_rate} +\alias{ps_convergence_rate} +\title{Pareto convergence rate} +\usage{ +ps_convergence_rate(k, ndraws, ...) +} +\arguments{ +\item{k}{pareto-k values} + +\item{ndraws}{number of draws} + +\item{...}{unused} +} +\value{ +convergence rate +} +\description{ +Given number of draws and scalar or array of k's, compute the +relative convergence rate of PSIS estimate RMSE. See Appendix B of +Vehtari et al. (2024). This function is exported to be usable by +other packages. For user-facing diagnostic functions, see +\code{\link{pareto_convergence_rate}} and \code{\link{pareto_diags}}. +} +\seealso{ +Other helper-functions: +\code{\link{ps_khat_threshold}()}, +\code{\link{ps_min_ss}()}, +\code{\link{ps_tail_length}()} +} +\concept{helper-functions} diff --git a/man/ps_khat_threshold.Rd b/man/ps_khat_threshold.Rd new file mode 100644 index 0000000..d82ebcb --- /dev/null +++ b/man/ps_khat_threshold.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pareto_smooth.R +\name{ps_khat_threshold} +\alias{ps_khat_threshold} +\title{Pareto k-hat threshold} +\usage{ +ps_khat_threshold(ndraws, ...) +} +\arguments{ +\item{ndraws}{number of draws} + +\item{...}{unused} +} +\value{ +threshold +} +\description{ +Given number of draws, computes khat threshold for reliable Pareto +smoothed estimate (to have small probability of large error). See +section 3.2.4, equation (13) of Vehtari et al. (2024). This +function is exported to be usable by other packages. For +user-facing diagnostic functions, see \code{\link{pareto_khat_threshold}} and +\code{\link{pareto_diags}}. +} +\seealso{ +Other helper-functions: +\code{\link{ps_convergence_rate}()}, +\code{\link{ps_min_ss}()}, +\code{\link{ps_tail_length}()} +} +\concept{helper-functions} diff --git a/man/ps_min_ss.Rd b/man/ps_min_ss.Rd new file mode 100644 index 0000000..533cfa3 --- /dev/null +++ b/man/ps_min_ss.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pareto_smooth.R +\name{ps_min_ss} +\alias{ps_min_ss} +\title{Pareto-smoothing minimum sample-size} +\usage{ +ps_min_ss(k, ...) +} +\arguments{ +\item{k}{pareto k value} + +\item{...}{unused} +} +\value{ +minimum sample size +} +\description{ +Given Pareto-k computes the minimum sample size for reliable Pareto +smoothed estimate (to have small probability of large error). See +section 3.2.3 in Vehtari et al. (2024). This function is exported +to be usable by other packages. For user-facing diagnostic functions, see +\code{\link{pareto_min_ss}} and \code{\link{pareto_diags}}. +} +\seealso{ +Other helper-functions: +\code{\link{ps_convergence_rate}()}, +\code{\link{ps_khat_threshold}()}, +\code{\link{ps_tail_length}()} +} +\concept{helper-functions} diff --git a/man/ps_tail_length.Rd b/man/ps_tail_length.Rd new file mode 100644 index 0000000..036afe3 --- /dev/null +++ b/man/ps_tail_length.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pareto_smooth.R +\name{ps_tail_length} +\alias{ps_tail_length} +\title{Pareto tail length} +\usage{ +ps_tail_length(ndraws, r_eff, ...) +} +\arguments{ +\item{ndraws}{number of draws} + +\item{r_eff}{relative efficiency} + +\item{...}{unused} +} +\value{ +tail length +} +\description{ +Calculate the tail length from number of draws and relative efficiency +r_eff. See Appendix H in Vehtari et al. (2024). This function is +used internally and is exported to be available for other packages. +} +\seealso{ +Other helper-functions: +\code{\link{ps_convergence_rate}()}, +\code{\link{ps_khat_threshold}()}, +\code{\link{ps_min_ss}()} +} +\concept{helper-functions}