Skip to content

Commit

Permalink
Merge pull request #361 from n-kall/export-helpers
Browse files Browse the repository at this point in the history
export helper functions
  • Loading branch information
paul-buerkner authored May 15, 2024
2 parents 419a1a3 + ee9d36f commit 86c8fba
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 57 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,10 @@ export(pareto_khat)
export(pareto_khat_threshold)
export(pareto_min_ss)
export(pareto_smooth)
export(ps_convergence_rate)
export(ps_khat_threshold)
export(ps_min_ss)
export(ps_tail_length)
export(quantile2)
export(r_scale)
export(rdo)
Expand Down
41 changes: 24 additions & 17 deletions R/gpd.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,28 @@ qgeneralized_pareto <- function(p, mu = 0, sigma = 1, k = 0, lower.tail = TRUE,

#' Estimate parameters of the Generalized Pareto distribution
#'
#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and \eqn{\sigma} of
#' the generalized Pareto distribution (GPD), assuming the location parameter is
#' 0. By default the fit uses a prior for \eqn{k} (this is in addition to the prior described by Zhang and Stephens, 2009), which will stabilize
#' estimates for very small sample sizes (and low effective sample sizes in the
#' case of MCMC samples). The weakly informative prior is a Gaussian prior
#' centered at 0.5 (see details in Vehtari et al., 2024).
#'
#' @param x A numeric vector. The sample from which to estimate the parameters.
#' @param wip Logical indicating whether to adjust \eqn{k} based on a weakly
#' informative Gaussian prior centered on 0.5. Defaults to `TRUE`.
#' @param min_grid_pts The minimum number of grid points used in the fitting
#' algorithm. The actual number used is `min_grid_pts + floor(sqrt(length(x)))`.
#' @param sort_x If `TRUE` (the default), the first step in the fitting
#' algorithm is to sort the elements of `x`. If `x` is already
#' sorted in ascending order then `sort_x` can be set to `FALSE` to
#' skip the initial sorting step.
#' Given a sample \eqn{x}, Estimate the parameters \eqn{k} and
#' \eqn{\sigma} of the generalized Pareto distribution (GPD), assuming
#' the location parameter is 0. By default the fit uses a prior for
#' \eqn{k} (this is in addition to the prior described by Zhang and
#' Stephens, 2009), which will stabilize estimates for very small
#' sample sizes (and low effective sample sizes in the case of MCMC
#' samples). The weakly informative prior is a Gaussian prior centered
#' at 0.5 (see details in Vehtari et al., 2024). This is used
#' internally but is exported for use by other packages.
#' @family helper-functions
#' @param x A numeric vector. The sample from which to estimate the
#' parameters.
#' @param wip Logical indicating whether to adjust \eqn{k} based on a
#' weakly informative Gaussian prior centered on 0.5. Defaults to
#' `TRUE`.
#' @param min_grid_pts The minimum number of grid points used in the
#' fitting algorithm. The actual number used is `min_grid_pts +
#' floor(sqrt(length(x)))`.
#' @param sort_x If `TRUE` (the default), the first step in the
#' fitting algorithm is to sort the elements of `x`. If `x` is
#' already sorted in ascending order then `sort_x` can be set to
#' `FALSE` to skip the initial sorting step.
#' @return A named list with components `k` and `sigma`.
#'
#' @details Here the parameter \eqn{k} is the negative of \eqn{k} in Zhang &
Expand Down Expand Up @@ -67,7 +73,8 @@ gpdfit <- function(x, wip = TRUE, min_grid_pts = 30, sort_x = TRUE) {
sigma_hat <- -k_hat / theta_hat

# adjust k_hat based on weakly informative prior, Gaussian centered on 0.5.
# this stabilizes estimates for very small Monte Carlo sample sizes and low neff (see Vehtari et al., 2024 for details)
# this stabilizes estimates for very small Monte Carlo sample sizes and low ess
# (see Vehtari et al., 2024 for details)
if (wip) {
k_hat <- (k_hat * N + 0.5 * 10) / (N + 10)
}
Expand Down
97 changes: 58 additions & 39 deletions R/pareto_smooth.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,25 +281,25 @@ pareto_smooth.default <- function(x,
}

tail <- match.arg(tail)
S <- length(x)
ndraws <- length(x)

# automatically calculate relative efficiency
if (is.null(r_eff)) {
r_eff <- ess_tail(x) / S
r_eff <- ess_tail(x) / ndraws
}

# automatically calculate tail length
if (is.null(ndraws_tail)) {
ndraws_tail <- ps_tail_length(S, r_eff)
ndraws_tail <- ps_tail_length(ndraws, r_eff)
}

if (tail == "both") {

if (ndraws_tail > S / 2) {
if (ndraws_tail > ndraws / 2) {
warning("Number of tail draws cannot be more than half ",
"the total number of draws if both tails are fit, ",
"changing to ", S / 2, ".")
ndraws_tail <- S / 2
"changing to ", ndraws / 2, ".")
ndraws_tail <- ndraws / 2
}

if (ndraws_tail < 5) {
Expand Down Expand Up @@ -347,7 +347,7 @@ pareto_smooth.default <- function(x,
diags_list <- list(khat = k)

if (extra_diags) {
ext_diags <- .pareto_smooth_extra_diags(k, S)
ext_diags <- .pareto_smooth_extra_diags(k, ndraws)
diags_list <- c(diags_list, ext_diags)
}

Expand Down Expand Up @@ -447,8 +447,8 @@ pareto_convergence_rate.rvar <- function(x, ...) {

tail <- match.arg(tail)

S <- length(x)
tail_ids <- seq(S - ndraws_tail + 1, S)
ndraws <- length(x)
tail_ids <- seq(ndraws - ndraws_tail + 1, ndraws)

if (tail == "left") {
x <- -x
Expand Down Expand Up @@ -527,15 +527,15 @@ pareto_convergence_rate.rvar <- function(x, ...) {
#' Extra pareto-k diagnostics
#'
#' internal function to calculate the extra diagnostics for a given
#' pareto k and sample size S
#' pareto k and number of draws ndraws
#' @noRd
.pareto_smooth_extra_diags <- function(k, S, ...) {
.pareto_smooth_extra_diags <- function(k, ndraws, ...) {

min_ss <- ps_min_ss(k)

khat_threshold <- ps_khat_threshold(S)
khat_threshold <- ps_khat_threshold(ndraws)

convergence_rate <- ps_convergence_rate(k, S)
convergence_rate <- ps_convergence_rate(k, ndraws)

other_diags <- list(
min_ss = min_ss,
Expand All @@ -547,12 +547,15 @@ pareto_convergence_rate.rvar <- function(x, ...) {
#' Pareto-smoothing minimum sample-size
#'
#' Given Pareto-k computes the minimum sample size for reliable Pareto
#' smoothed estimate (to have small probability of large error)
#' Equation (11) in PSIS paper
#' smoothed estimate (to have small probability of large error). See
#' section 3.2.3 in Vehtari et al. (2024). This function is exported
#' to be usable by other packages. For user-facing diagnostic functions, see
#' [`pareto_min_ss`] and [`pareto_diags`].
#' @family helper-functions
#' @param k pareto k value
#' @param ... unused
#' @return minimum sample size
#' @noRd
#' @export
ps_min_ss <- function(k, ...) {
if (k < 1) {
out <- 10^(1 / (1 - max(0, k)))
Expand All @@ -562,58 +565,74 @@ ps_min_ss <- function(k, ...) {
out
}

#' Pareto-smoothing k-hat threshold
#' Pareto k-hat threshold
#'
#' Given sample size S computes khat threshold for reliable Pareto
#' Given number of draws, computes khat threshold for reliable Pareto
#' smoothed estimate (to have small probability of large error). See
#' section 3.2.4, equation (13).
#' @param S sample size
#' section 3.2.4, equation (13) of Vehtari et al. (2024). This
#' function is exported to be usable by other packages. For
#' user-facing diagnostic functions, see [`pareto_khat_threshold`] and
#' [`pareto_diags`].
#' @family helper-functions
#' @param ndraws number of draws
#' @param ... unused
#' @return threshold
#' @noRd
ps_khat_threshold <- function(S, ...) {
1 - 1 / log10(S)
#' @export
ps_khat_threshold <- function(ndraws, ...) {
1 - 1 / log10(ndraws)
}

#' Pareto-smoothing convergence rate
#' Pareto convergence rate
#'
#' Given S and scalar or array of k's, compute the relative
#' convergence rate of PSIS estimate RMSE
#' Given number of draws and scalar or array of k's, compute the
#' relative convergence rate of PSIS estimate RMSE. See Appendix B of
#' Vehtari et al. (2024). This function is exported to be usable by
#' other packages. For user-facing diagnostic functions, see
#' [`pareto_convergence_rate`] and [`pareto_diags`].
#' @family helper-functions
#' @param k pareto-k values
#' @param S sample size
#' @param ndraws number of draws
#' @param ... unused
#' @return convergence rate
#' @noRd
ps_convergence_rate <- function(k, S, ...) {
#' @export
ps_convergence_rate <- function(k, ndraws, ...) {
# allow array of k's
rate <- numeric(length(k))
# k<0 bounded distribution
rate[k < 0] <- 1
# k>0 non-finite mean
rate[k > 1] <- 0
# limit value at k=1/2
rate[k == 0.5] <- 1 - 1 / log(S)
# smooth approximation for the rest (see Appendix of PSIS paper)
rate[k == 0.5] <- 1 - 1 / log(ndraws)
# smooth approximation for the rest (see Appendix B of PSIS paper)
ki <- (k > 0 & k < 1 & k != 0.5)
kk <- k[ki]
rate[ki] <- pmax(
0,
(2 * (kk - 1) * S^(2 * kk + 1) + (1 - 2 * kk) * S^(2 * kk) + S^2) /
((S - 1) * (S - S^(2 * kk)))
(2 * (kk - 1) * ndraws^(2 * kk + 1) + (1 - 2 * kk) * ndraws^(2 * kk) + ndraws^2) /
((ndraws - 1) * (ndraws - ndraws^(2 * kk)))
)
rate
}

#' Calculate the tail length from S and r_eff
#' Appendix H in PSIS paper
#' @noRd
ps_tail_length <- function(S, r_eff, ...) {
ifelse(S > 225, ceiling(3 * sqrt(S / r_eff)), S / 5)
#' Pareto tail length
#'
#' Calculate the tail length from number of draws and relative efficiency
#' r_eff. See Appendix H in Vehtari et al. (2024). This function is
#' used internally and is exported to be available for other packages.
#' @family helper-functions
#' @param ndraws number of draws
#' @param r_eff relative efficiency
#' @param ... unused
#' @return tail length
#' @export
ps_tail_length <- function(ndraws, r_eff, ...) {
ifelse(ndraws > 225, ceiling(3 * sqrt(ndraws / r_eff)), ndraws / 5)
}

#' Pareto-k diagnostic message
#'
#' Given S and scalar and k, form a diagnostic message string
#' Given number of draws and k, form a diagnostic message string.
#' @param diags (numeric) named vector of diagnostic values
#' @param are_weights (logical) are the diagnostics for weights
#' @param ... unused
Expand Down
2 changes: 1 addition & 1 deletion man-roxygen/ref-vehtari-paretosmooth-2022.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
#' Aki Vehtari, Daniel Simpson, Andrew Gelman, Yuling Yao and
#' Jonah Gabry (2024). Pareto Smoothed Importance Sampling.
#' *Journal of Machine Learning Research*, 25(72):1-58.
#' [PDF](https://jmlr.org/papers/v25/19-556.html)
#' [PDF](https://jmlr.org/papers/v25/19-556.html)
32 changes: 32 additions & 0 deletions man/ps_convergence_rate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions man/ps_khat_threshold.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions man/ps_min_ss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions man/ps_tail_length.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 86c8fba

Please sign in to comment.