From 137a6278a2587413eb423cafff47084ee1359db8 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 26 Oct 2023 19:03:54 +0200 Subject: [PATCH 01/73] DHARMa implementation for new `check_residuals()` function Fixes #595 --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2326f93f7..fa06c9ae3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.6.3 +Version: 0.10.6.4 Authors@R: c(person(given = "Daniel", family = "Lüdecke", @@ -91,6 +91,7 @@ Suggests: correlation, cplm, dbscan, + DHARMa, estimatr, fixest, flextable, From 4fc92747b8e75e69002f406d15c8872d0fdf4b9d Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 26 Oct 2023 20:56:11 +0200 Subject: [PATCH 02/73] draft function --- NAMESPACE | 1 + R/simulate_residuals.R | 28 ++++++++++++++++++++++++++++ man/simulate_residuals.Rd | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 R/simulate_residuals.R create mode 100644 man/simulate_residuals.Rd diff --git a/NAMESPACE b/NAMESPACE index e573d5ce0..959b1cc58 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -576,6 +576,7 @@ export(r2_tjur) export(r2_xu) export(r2_zeroinflated) export(rmse) +export(simulate_residuals) export(test_bf) export(test_likelihoodratio) export(test_lrt) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R new file mode 100644 index 000000000..61a5043fd --- /dev/null +++ b/R/simulate_residuals.R @@ -0,0 +1,28 @@ +#' @title Check model for (non-)constant error variance +#' @name simulate_residuals +#' +#' @description to do. +#' +#' @param x A model object. +#' @param ... Currently not used. +#' +#' @return Simulated residuals. +#' +#' @details Based on [`DHARMa::simulateResiduals()`]. +#' +#' @examples +#' m <<- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +#' check_heteroscedasticity(m) +#' +#' # plot results +#' if (require("see")) { +#' x <- check_heteroscedasticity(m) +#' plot(x) +#' } +#' @export +simulate_residuals <- function(x, ...) { + insight::check_if_installed("DHARMa") + out <- stats::residuals(DHARMa::simulateResiduals(x, ...)) + class(out) <- c("performance_simres", class(out)) + out +} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd new file mode 100644 index 000000000..32ee0820d --- /dev/null +++ b/man/simulate_residuals.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulate_residuals.R +\name{simulate_residuals} +\alias{simulate_residuals} +\title{Check model for (non-)constant error variance} +\usage{ +simulate_residuals(x, ...) +} +\arguments{ +\item{x}{A model object.} + +\item{...}{Currently not used.} +} +\value{ +Simulated residuals. +} +\description{ +to do. +} +\details{ +Based on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. +} +\examples{ +m <<- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +check_heteroscedasticity(m) + +# plot results +if (require("see")) { + x <- check_heteroscedasticity(m) + plot(x) +} +} From e38cc083b12dbf20a084c3060b7c6d25f3072a76 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 26 Oct 2023 22:13:21 +0200 Subject: [PATCH 03/73] update pkgdown --- _pkgdown.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 71f444657..d6e56740c 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -13,6 +13,7 @@ reference: contents: - binned_residuals - starts_with("check_") + - simulate_residuals - title: "Check Model Performance or Quality" contents: From 4c805bf6c5544fd686d07a42ef95853e474666ee Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:17:46 -0700 Subject: [PATCH 04/73] Don't prematurely return residuals in `simulate_residuals()` --- R/simulate_residuals.R | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 61a5043fd..1cc4d966c 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -1,28 +1,46 @@ -#' @title Check model for (non-)constant error variance +#' @title Simulate randomized quantile residuals from a model #' @name simulate_residuals #' #' @description to do. #' #' @param x A model object. -#' @param ... Currently not used. +#' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. #' #' @return Simulated residuals. #' -#' @details Based on [`DHARMa::simulateResiduals()`]. +#' @details Based on [`DHARMa::simulateResiduals()`]. See also `vignette("DHARMa")`. +#' +#' @references +#' +#' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical (Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from https://CRAN.R-project.org/package=DHARMa +#' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal of Computational and Graphical Statistics, 5(3), 236. https://doi.org/10.2307/1390802 #' #' @examples -#' m <<- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) -#' check_heteroscedasticity(m) -#' -#' # plot results -#' if (require("see")) { -#' x <- check_heteroscedasticity(m) -#' plot(x) -#' } +#' m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +#' simulate_residuals(m) +#' #' @export simulate_residuals <- function(x, ...) { insight::check_if_installed("DHARMa") - out <- stats::residuals(DHARMa::simulateResiduals(x, ...)) + # TODO (low priority): Note that DHARMa::simulateResiduals(x, ...) does its own checks for whether + # or not the model passed to it is supported, do we want to use this or do our + # own checks so we can supply our own error message? + # + # It's important to preserve this object as is, rather than prematurely + # extracting the residuals from it because the object contains important stuff + # in it that we'll want to pass onto other functions later, such as passing + # the fitted model into check_model(). + out <- DHARMa::simulateResiduals(x, ...) class(out) <- c("performance_simres", class(out)) out } + +# methods ------------------------------ + +#' @export +print.performance_simres <- function(x, ...) { + # TODO (low priority): We can probably just base this off of the print method + # DHARMa uses, but with an easystats style. For now we can just stick with + # DHARMa's method. + print(x) +} From bb1a2f36b9f1d0a81ae9a9738d49ce09df13e68b Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:19:03 -0700 Subject: [PATCH 05/73] draft `check_residuals()` --- R/check_residuals.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 R/check_residuals.R diff --git a/R/check_residuals.R b/R/check_residuals.R new file mode 100644 index 000000000..e5b649307 --- /dev/null +++ b/R/check_residuals.R @@ -0,0 +1,22 @@ +#' Check uniformity of simulated residuals +#' +#' `check_residuals()` checks generalized linear (mixed) models for uniformity +#' of randomized quantile residuals, which can be used to identify typical model +#' misspecification problems, such as over/underdispersion, zero-inflation, and +#' residual spatial and temporal autocorrelation. +#' +#' @export +check_residuals <- function(x, ...) { + # TODO: This should be an S3 method instead of using ifelse + if (any(class(x) %in% c("performance_simres", "DHARMa"))) { + # tests if the overall distribution conforms to expectations; equivalent to: + # ks.test(residuals(simulated_residuals), "punif") + DHARMa::testUniformity(x, plot = FALSE, ...) + } else { + stop("Unsupported input") + } +} + +# methods ------------------------------ + +# TODO: Add print method From de272d7bd152ac95ad6f5e3ca28e4a6c7157fc27 Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:21:06 -0700 Subject: [PATCH 06/73] Add (temporary?) vignette for basic simulated residuals workflow demonstration and discussion --- vignettes/simulate_residuals.Rmd | 73 ++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 vignettes/simulate_residuals.Rmd diff --git a/vignettes/simulate_residuals.Rmd b/vignettes/simulate_residuals.Rmd new file mode 100644 index 000000000..9909aa5af --- /dev/null +++ b/vignettes/simulate_residuals.Rmd @@ -0,0 +1,73 @@ +--- +title: "Checking simulated residuals" +output: + rmarkdown::html_vignette: + toc: true + fig_width: 10.08 + fig_height: 6 +tags: [r, performance] +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{Checking simulated residuals} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r} +devtools::load_all() +library(DHARMa) +``` + +The basic workflow for simulated residual checks using `simulate_residuals()` is as follows. + +First, fit a model: + +```{r} +library(glmmTMB) + +model <- glmmTMB( + count ~ mined + spp + (1 | site), + family = poisson, + data = Salamanders +) +``` + +Next, simulate residuals from the model: + +```{r} +simulated_residuals <- simulate_residuals(model) + +simulated_residuals +``` + + +Note that since this inherits the DHARMa class, all the methods implemented in DHARMa just work, including all the tests: + +```{r} +residuals(simulated_residuals) + +DHARMa::testUniformity(simulated_residuals, plot = FALSE) +``` + + +Finally, run specific checks on the simulated residuals: + +```{r} +check_residuals(simulated_residuals) +``` + +Or check the entire model. + +```{r, eval=FALSE} +# TODO (not implemented) +check_model(simulated_residuals) +``` + +The `check_model()` function is the main reason we don't want to prematurely extract the residuals in `simulate_residuals()`, because if we do then the `simulated_residuals` won't contain the model fit (`fittedModel` in the output below), so we won't be able to do all of the checks we would want to do using the model. + +```{r} +str(simulated_residuals, max.level = 1) +``` + +It would also mean we would need to reimplement some of the tests from DHARMa (e.g., `DHARMa::testOutliers()`) if we're planning to include those checks as well. We probably don't want to do that, since some of them are fairly involved rather than just being wrappers for tests supplied in base R (e.g., ) . From a6a5afe3711857922ab23bcbbb63afe9eb3d1a4e Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:31:02 -0700 Subject: [PATCH 07/73] clarify why model fit is needed --- vignettes/simulate_residuals.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/simulate_residuals.Rmd b/vignettes/simulate_residuals.Rmd index 9909aa5af..0c0872b21 100644 --- a/vignettes/simulate_residuals.Rmd +++ b/vignettes/simulate_residuals.Rmd @@ -64,7 +64,7 @@ Or check the entire model. check_model(simulated_residuals) ``` -The `check_model()` function is the main reason we don't want to prematurely extract the residuals in `simulate_residuals()`, because if we do then the `simulated_residuals` won't contain the model fit (`fittedModel` in the output below), so we won't be able to do all of the checks we would want to do using the model. +The `check_model()` function is the main reason we don't want to prematurely extract the residuals in `simulate_residuals()`, because if we do then the `simulated_residuals` won't contain the model fit (`fittedModel` in the output below), so we won't be able to do all of the checks we would want to do using the model (e.g., posterior predictive checks). ```{r} str(simulated_residuals, max.level = 1) From 5b96362df5cfacac3a9c444179e39da80a122c3c Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 09:43:04 +0200 Subject: [PATCH 08/73] fix vignette issues --- vignettes/simulate_residuals.Rmd | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vignettes/simulate_residuals.Rmd b/vignettes/simulate_residuals.Rmd index 0c0872b21..7338aea4e 100644 --- a/vignettes/simulate_residuals.Rmd +++ b/vignettes/simulate_residuals.Rmd @@ -14,9 +14,29 @@ editor_options: chunk_output_type: console --- -```{r} -devtools::load_all() -library(DHARMa) +```{r , include=FALSE} +library(knitr) +library(performance) +options(knitr.kable.NA = "") +knitr::opts_chunk$set( + comment = ">", + message = FALSE, + warning = FALSE, + out.width = "100%", + dpi = 450 +) +options(digits = 2) + +pkgs <- c("DHARMa", "glmmTMB") +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} ``` The basic workflow for simulated residual checks using `simulate_residuals()` is as follows. @@ -45,6 +65,7 @@ simulated_residuals Note that since this inherits the DHARMa class, all the methods implemented in DHARMa just work, including all the tests: ```{r} +library(DHARMa) residuals(simulated_residuals) DHARMa::testUniformity(simulated_residuals, plot = FALSE) From ddc4d6b61b8b4e2b6ee1e9f30e48c183bb63dbba Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 09:44:38 +0200 Subject: [PATCH 09/73] lintr --- R/check_residuals.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/check_residuals.R b/R/check_residuals.R index e5b649307..26f8a614a 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -7,13 +7,14 @@ #' #' @export check_residuals <- function(x, ...) { + insight::check_if_installed("DHARMa") # TODO: This should be an S3 method instead of using ifelse - if (any(class(x) %in% c("performance_simres", "DHARMa"))) { + if (inherits(x, c("performance_simres", "DHARMa"))) { # tests if the overall distribution conforms to expectations; equivalent to: # ks.test(residuals(simulated_residuals), "punif") DHARMa::testUniformity(x, plot = FALSE, ...) } else { - stop("Unsupported input") + insight::format_error("Unsupported input.") } } From a51e77ad8467c97f9a62222dd05016e12bb504b4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 09:46:08 +0200 Subject: [PATCH 10/73] RD; update namespace --- NAMESPACE | 2 ++ R/simulate_residuals.R | 8 ++++++-- man/check_residuals.Rd | 14 ++++++++++++++ man/simulate_residuals.Rd | 22 +++++++++++++--------- 4 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 man/check_residuals.Rd diff --git a/NAMESPACE b/NAMESPACE index 959b1cc58..5b4e3819c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -299,6 +299,7 @@ S3method(print,performance_pcp) S3method(print,performance_pp_check) S3method(print,performance_roc) S3method(print,performance_score) +S3method(print,performance_simres) S3method(print,r2_bayes) S3method(print,r2_generic) S3method(print,r2_loo) @@ -523,6 +524,7 @@ export(check_outliers) export(check_overdispersion) export(check_posterior_predictions) export(check_predictions) +export(check_residuals) export(check_singularity) export(check_sphericity) export(check_sphericity_bartlett) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 1cc4d966c..ca2e58402 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -12,8 +12,12 @@ #' #' @references #' -#' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical (Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from https://CRAN.R-project.org/package=DHARMa -#' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal of Computational and Graphical Statistics, 5(3), 236. https://doi.org/10.2307/1390802 +#' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +#' (Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +#' https://CRAN.R-project.org/package=DHARMa +#' +#' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +#' of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} #' #' @examples #' m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd new file mode 100644 index 000000000..eb74a9bdd --- /dev/null +++ b/man/check_residuals.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_residuals.R +\name{check_residuals} +\alias{check_residuals} +\title{Check uniformity of simulated residuals} +\usage{ +check_residuals(x, ...) +} +\description{ +\code{check_residuals()} checks generalized linear (mixed) models for uniformity +of randomized quantile residuals, which can be used to identify typical model +misspecification problems, such as over/underdispersion, zero-inflation, and +residual spatial and temporal autocorrelation. +} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 32ee0820d..22da129b2 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/simulate_residuals.R \name{simulate_residuals} \alias{simulate_residuals} -\title{Check model for (non-)constant error variance} +\title{Simulate randomized quantile residuals from a model} \usage{ simulate_residuals(x, ...) } \arguments{ \item{x}{A model object.} -\item{...}{Currently not used.} +\item{...}{Arguments passed on to \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} } \value{ Simulated residuals. @@ -18,15 +18,19 @@ Simulated residuals. to do. } \details{ -Based on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. +Based on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. See also \code{vignette("DHARMa")}. } \examples{ -m <<- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) -check_heteroscedasticity(m) +m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +simulate_residuals(m) -# plot results -if (require("see")) { - x <- check_heteroscedasticity(m) - plot(x) +} +\references{ +\itemize{ +\item Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +(Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +https://CRAN.R-project.org/package=DHARMa +\item Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} } } From 6ffa8b67d611a3d11329191ac58eab5f12086e7b Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 09:58:53 +0200 Subject: [PATCH 11/73] add method for check_normality --- NAMESPACE | 1 + R/check_normality.R | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 5b4e3819c..c69240f80 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,7 @@ S3method(check_normality,htest) S3method(check_normality,lmerModLmerTest) S3method(check_normality,merMod) S3method(check_normality,numeric) +S3method(check_normality,performance_simres) S3method(check_outliers,BFBayesFactor) S3method(check_outliers,character) S3method(check_outliers,data.frame) diff --git a/R/check_normality.R b/R/check_normality.R index 9dc00d03f..8f0d9dd85 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -92,6 +92,24 @@ check_normality.glm <- function(x, ...) { invisible(out) } +# DHARMa ------------------- + +#' @export +check_normality.performance_simres <- function(x, ...) { + # check for normality of residuals + res <- stats::residuals(x, quantileFunction = stats::qnorm) + p.val <- suppressWarnings(stats::ks.test(stats::residuals(x), "punif"))$p.value + + attr(p.val, "data") <- res + attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(p.val, "effects") <- "fixed" + attr(p.val, "type") <- "residuals" + class(p.val) <- unique(c("check_normality", "see_check_normality", class(p.val))) + + p.val +} + + # numeric ------------------- #' @export From 991b4831b6ee7eb08ca661856eeb196aa19c5eb5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 10:06:42 +0200 Subject: [PATCH 12/73] add class attr for see::plot --- R/check_normality.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_normality.R b/R/check_normality.R index 8f0d9dd85..215d6a6c6 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -104,7 +104,7 @@ check_normality.performance_simres <- function(x, ...) { attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) attr(p.val, "effects") <- "fixed" attr(p.val, "type") <- "residuals" - class(p.val) <- unique(c("check_normality", "see_check_normality", class(p.val))) + class(p.val) <- unique(c("check_normality", "see_check_normality", "performance_simres", class(p.val))) p.val } From 542ea79c400bf02b5d7d8d2db01150476e2918d9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 11:34:23 +0200 Subject: [PATCH 13/73] save necessary info --- R/check_normality.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/check_normality.R b/R/check_normality.R index 215d6a6c6..497d6da0f 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -101,15 +101,16 @@ check_normality.performance_simres <- function(x, ...) { p.val <- suppressWarnings(stats::ks.test(stats::residuals(x), "punif"))$p.value attr(p.val, "data") <- res - attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(p.val, "object_name") <- NULL attr(p.val, "effects") <- "fixed" attr(p.val, "type") <- "residuals" + attr(p.val, "model") <- x$fittedModel + attr(p.val, "model_info") <- insight::model_info(x$fittedModel) class(p.val) <- unique(c("check_normality", "see_check_normality", "performance_simres", class(p.val))) p.val } - # numeric ------------------- #' @export From 0e9a2aea6ac9fc529dadd862852b83b7f0f7d442 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 12:03:21 +0200 Subject: [PATCH 14/73] fix check issues --- R/check_residuals.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/check_residuals.R b/R/check_residuals.R index 26f8a614a..3bd8441d2 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -5,6 +5,9 @@ #' misspecification problems, such as over/underdispersion, zero-inflation, and #' residual spatial and temporal autocorrelation. #' +#' @param x An object returned by [`simulated_residuals()`]. +#' @param ... Passed down to [`DHARMa::testUniformity()`] +#' #' @export check_residuals <- function(x, ...) { insight::check_if_installed("DHARMa") From 30038c9abedc3ea6d1365b9ca3af2b4ab9ccea0b Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 27 Oct 2023 12:03:31 +0200 Subject: [PATCH 15/73] rd --- man/check_residuals.Rd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index eb74a9bdd..f8595b34a 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -6,6 +6,11 @@ \usage{ check_residuals(x, ...) } +\arguments{ +\item{x}{An object returned by \code{\link[=simulated_residuals]{simulated_residuals()}}.} + +\item{...}{Passed down to \code{\link[DHARMa:testUniformity]{DHARMa::testUniformity()}}} +} \description{ \code{check_residuals()} checks generalized linear (mixed) models for uniformity of randomized quantile residuals, which can be used to identify typical model From 50f1f3d7916799c473e7f11d4ec8f82fdea78bf2 Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:38:23 -0700 Subject: [PATCH 16/73] Add basic print output for `simulate_residuals()` --- R/simulate_residuals.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index ca2e58402..4fcad4472 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -46,5 +46,8 @@ print.performance_simres <- function(x, ...) { # TODO (low priority): We can probably just base this off of the print method # DHARMa uses, but with an easystats style. For now we can just stick with # DHARMa's method. - print(x) + cat( + "Simulated residuals from a model of class", class(x$fittedModel), + "based on", x$nSim, "simulations." + ) } From 25ff8902caec97666695d9469ceaf7ba4a784809 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 30 Oct 2023 07:57:05 +0100 Subject: [PATCH 17/73] draft --- NAMESPACE | 4 +++- R/check_normality.R | 19 ---------------- R/check_residuals.R | 46 +++++++++++++++++++++++++++++---------- R/simulate_residuals.R | 17 ++++++++++----- man/check_residuals.Rd | 19 ++++++++++++++-- man/simulate_residuals.Rd | 7 ++++-- 6 files changed, 71 insertions(+), 41 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c69240f80..f70dc5e92 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,7 +70,6 @@ S3method(check_normality,htest) S3method(check_normality,lmerModLmerTest) S3method(check_normality,merMod) S3method(check_normality,numeric) -S3method(check_normality,performance_simres) S3method(check_outliers,BFBayesFactor) S3method(check_outliers,character) S3method(check_outliers,data.frame) @@ -106,6 +105,9 @@ S3method(check_overdispersion,poissonmfx) S3method(check_predictions,BFBayesFactor) S3method(check_predictions,default) S3method(check_predictions,lme) +S3method(check_residuals,DHARMa) +S3method(check_residuals,default) +S3method(check_residuals,performance_simres) S3method(check_singularity,MixMod) S3method(check_singularity,clmm) S3method(check_singularity,cpglmm) diff --git a/R/check_normality.R b/R/check_normality.R index 497d6da0f..9dc00d03f 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -92,25 +92,6 @@ check_normality.glm <- function(x, ...) { invisible(out) } -# DHARMa ------------------- - -#' @export -check_normality.performance_simres <- function(x, ...) { - # check for normality of residuals - res <- stats::residuals(x, quantileFunction = stats::qnorm) - p.val <- suppressWarnings(stats::ks.test(stats::residuals(x), "punif"))$p.value - - attr(p.val, "data") <- res - attr(p.val, "object_name") <- NULL - attr(p.val, "effects") <- "fixed" - attr(p.val, "type") <- "residuals" - attr(p.val, "model") <- x$fittedModel - attr(p.val, "model_info") <- insight::model_info(x$fittedModel) - class(p.val) <- unique(c("check_normality", "see_check_normality", "performance_simres", class(p.val))) - - p.val -} - # numeric ------------------- #' @export diff --git a/R/check_residuals.R b/R/check_residuals.R index 3bd8441d2..8ef1f6866 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -5,22 +5,46 @@ #' misspecification problems, such as over/underdispersion, zero-inflation, and #' residual spatial and temporal autocorrelation. #' -#' @param x An object returned by [`simulated_residuals()`]. -#' @param ... Passed down to [`DHARMa::testUniformity()`] +#' @param x An object returned by [`simulate_residuals()`] or +#' [`DHARMa::simulateResiduals()`]. +#' @param alternative A character string specifying the alternative hypothesis. +#' See [`stats::ks.test()`] for details. +#' @param ... Passed down to [`stats::ks.test()`] +#' +#' @examplesIf require("DHARMa") +#' dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +#' m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +#' res <- simulate_residuals(m) +#' check_residuals(res) #' #' @export check_residuals <- function(x, ...) { - insight::check_if_installed("DHARMa") - # TODO: This should be an S3 method instead of using ifelse - if (inherits(x, c("performance_simres", "DHARMa"))) { - # tests if the overall distribution conforms to expectations; equivalent to: - # ks.test(residuals(simulated_residuals), "punif") - DHARMa::testUniformity(x, plot = FALSE, ...) - } else { - insight::format_error("Unsupported input.") - } + UseMethod("check_residuals") +} + +#' @export +check_residuals.default <- function(x, ...) { + insight::format_error("`check_residuals()` only works with objects returned by `simulate_residuals()` by `DHARMa::simulateResiduals()`.") # nolint +} + +#' @rdname check_residuals +#' @export +check_residuals.performance_simres <- function(x, + alternative = c("two.sided", "less", "greater"), + ...) { + alternative <- match.arg(alternative) + stats::ks.test( + stats::residuals(simulated_residuals), + "punif", + alternative = alternative, + ... + ) } +#' @export +check_residuals.DHARMa <- check_residuals.performance_simres + + # methods ------------------------------ # TODO: Add print method diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 4fcad4472..d1e920a3a 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -4,6 +4,7 @@ #' @description to do. #' #' @param x A model object. +#' @param iterations Number of simulations to run. #' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. #' #' @return Simulated residuals. @@ -19,12 +20,12 @@ #' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal #' of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} #' -#' @examples +#' @examplesIf require("DHARMa") #' m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) #' simulate_residuals(m) #' #' @export -simulate_residuals <- function(x, ...) { +simulate_residuals <- function(x, iterations = 250, ...) { insight::check_if_installed("DHARMa") # TODO (low priority): Note that DHARMa::simulateResiduals(x, ...) does its own checks for whether # or not the model passed to it is supported, do we want to use this or do our @@ -34,7 +35,7 @@ simulate_residuals <- function(x, ...) { # extracting the residuals from it because the object contains important stuff # in it that we'll want to pass onto other functions later, such as passing # the fitted model into check_model(). - out <- DHARMa::simulateResiduals(x, ...) + out <- DHARMa::simulateResiduals(x, n = iterations, plot = FALSE, ...) class(out) <- c("performance_simres", class(out)) out } @@ -46,8 +47,12 @@ print.performance_simres <- function(x, ...) { # TODO (low priority): We can probably just base this off of the print method # DHARMa uses, but with an easystats style. For now we can just stick with # DHARMa's method. - cat( - "Simulated residuals from a model of class", class(x$fittedModel), - "based on", x$nSim, "simulations." + msg <- paste0( + "Simulated residuals from a model of class `", class(x$fittedModel), + "` based on ", x$nSim, " simulations. Use `check_residuals()` to check ", + "uniformity of residuals. It is recommended to refer to `?DHARMa::simulateReisudals`", + " and `vignette(\"DHARMa\")` for more information about different settings", + " in particular situations or for particular models." ) + cat(insight::format_message(msg)) } diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index f8595b34a..fdeb931d9 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -2,14 +2,21 @@ % Please edit documentation in R/check_residuals.R \name{check_residuals} \alias{check_residuals} +\alias{check_residuals.performance_simres} \title{Check uniformity of simulated residuals} \usage{ check_residuals(x, ...) + +\method{check_residuals}{performance_simres}(x, alternative = c("two.sided", "less", "greater"), ...) } \arguments{ -\item{x}{An object returned by \code{\link[=simulated_residuals]{simulated_residuals()}}.} +\item{x}{An object returned by \code{\link[=simulate_residuals]{simulate_residuals()}} or +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} + +\item{...}{Passed down to \code{\link[stats:ks.test]{stats::ks.test()}}} -\item{...}{Passed down to \code{\link[DHARMa:testUniformity]{DHARMa::testUniformity()}}} +\item{alternative}{A character string specifying the alternative hypothesis. +See \code{\link[stats:ks.test]{stats::ks.test()}} for details.} } \description{ \code{check_residuals()} checks generalized linear (mixed) models for uniformity @@ -17,3 +24,11 @@ of randomized quantile residuals, which can be used to identify typical model misspecification problems, such as over/underdispersion, zero-inflation, and residual spatial and temporal autocorrelation. } +\examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +res <- simulate_residuals(m) +check_residuals(res) +\dontshow{\}) # examplesIf} +} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 22da129b2..4e0498624 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -4,11 +4,13 @@ \alias{simulate_residuals} \title{Simulate randomized quantile residuals from a model} \usage{ -simulate_residuals(x, ...) +simulate_residuals(x, iterations = 250, ...) } \arguments{ \item{x}{A model object.} +\item{iterations}{Number of simulations to run.} + \item{...}{Arguments passed on to \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} } \value{ @@ -21,9 +23,10 @@ to do. Based on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. See also \code{vignette("DHARMa")}. } \examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) simulate_residuals(m) - +\dontshow{\}) # examplesIf} } \references{ \itemize{ From 22157841c87c540bd063688f0f07824754cf69df Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 30 Oct 2023 07:59:03 +0100 Subject: [PATCH 18/73] fix print --- R/simulate_residuals.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index d1e920a3a..819c90bf9 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -48,11 +48,11 @@ print.performance_simres <- function(x, ...) { # DHARMa uses, but with an easystats style. For now we can just stick with # DHARMa's method. msg <- paste0( - "Simulated residuals from a model of class `", class(x$fittedModel), + "Simulated residuals from a model of class `", class(x$fittedModel)[1], "` based on ", x$nSim, " simulations. Use `check_residuals()` to check ", "uniformity of residuals. It is recommended to refer to `?DHARMa::simulateReisudals`", " and `vignette(\"DHARMa\")` for more information about different settings", - " in particular situations or for particular models." + " in particular situations or for particular models.\n" ) cat(insight::format_message(msg)) } From 5b439d7c9ec9e6d2c46163b55b953b40cd301f4a Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 30 Oct 2023 08:02:07 +0100 Subject: [PATCH 19/73] docs --- R/check_residuals.R | 6 ++++++ R/simulate_residuals.R | 5 ++++- man/check_residuals.Rd | 9 +++++++++ man/simulate_residuals.Rd | 6 +++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/R/check_residuals.R b/R/check_residuals.R index 8ef1f6866..c0968fb28 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -11,6 +11,12 @@ #' See [`stats::ks.test()`] for details. #' @param ... Passed down to [`stats::ks.test()`] #' +#' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +#' +#' @seealso [`simulate_residuals()`] +#' +#' @return The p-value of the test statistics. +#' #' @examplesIf require("DHARMa") #' dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) #' m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 819c90bf9..1afd84851 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -7,7 +7,10 @@ #' @param iterations Number of simulations to run. #' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. #' -#' @return Simulated residuals. +#' @return Simulated residuals, which can be further processed with +#' [`check_residuals()`]. +#' +#' @seealso [`check_residuals()`] #' #' @details Based on [`DHARMa::simulateResiduals()`]. See also `vignette("DHARMa")`. #' diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index fdeb931d9..412615ef3 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -18,12 +18,18 @@ check_residuals(x, ...) \item{alternative}{A character string specifying the alternative hypothesis. See \code{\link[stats:ks.test]{stats::ks.test()}} for details.} } +\value{ +The p-value of the test statistics. +} \description{ \code{check_residuals()} checks generalized linear (mixed) models for uniformity of randomized quantile residuals, which can be used to identify typical model misspecification problems, such as over/underdispersion, zero-inflation, and residual spatial and temporal autocorrelation. } +\details{ +Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +} \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) @@ -32,3 +38,6 @@ res <- simulate_residuals(m) check_residuals(res) \dontshow{\}) # examplesIf} } +\seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}} +} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 4e0498624..f032aabd9 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -14,7 +14,8 @@ simulate_residuals(x, iterations = 250, ...) \item{...}{Arguments passed on to \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} } \value{ -Simulated residuals. +Simulated residuals, which can be further processed with +\code{\link[=check_residuals]{check_residuals()}}. } \description{ to do. @@ -37,3 +38,6 @@ https://CRAN.R-project.org/package=DHARMa of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} } } +\seealso{ +\code{\link[=check_residuals]{check_residuals()}} +} From bd737f1476a4b75e042f986b97a69ada60251a7a Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 30 Oct 2023 08:12:19 +0100 Subject: [PATCH 20/73] add plot method --- NAMESPACE | 1 + R/simulate_residuals.R | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index f70dc5e92..33c6de753 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -265,6 +265,7 @@ S3method(plot,check_sphericity) S3method(plot,compare_performance) S3method(plot,performance_pp_check) S3method(plot,performance_roc) +S3method(plot,performance_simres) S3method(plot,test_likelihoodratio) S3method(plot,test_performance) S3method(print,binned_residuals) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 1afd84851..ffbd33b3d 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -39,7 +39,7 @@ simulate_residuals <- function(x, iterations = 250, ...) { # in it that we'll want to pass onto other functions later, such as passing # the fitted model into check_model(). out <- DHARMa::simulateResiduals(x, n = iterations, plot = FALSE, ...) - class(out) <- c("performance_simres", class(out)) + class(out) <- c("performance_simres", "see_performance_simres", class(out)) out } @@ -59,3 +59,9 @@ print.performance_simres <- function(x, ...) { ) cat(insight::format_message(msg)) } + +#' @export +plot.performance_simres <- function(x, ...) { + insight::check_if_installed("see", "for residual plots") + NextMethod() +} From d515869a0b517f44f6bf3600ba28d2a1a9d9f10a Mon Sep 17 00:00:00 2001 From: Michael McCarthy <51542091+mccarthy-m-g@users.noreply.github.com> Date: Fri, 3 Nov 2023 13:58:35 -0700 Subject: [PATCH 21/73] add print method for `check_residuals()` --- NAMESPACE | 1 + R/check_residuals.R | 45 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 33c6de753..521d979c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -285,6 +285,7 @@ S3method(print,check_outliers) S3method(print,check_outliers_metafor) S3method(print,check_outliers_metagen) S3method(print,check_overdisp) +S3method(print,check_residuals) S3method(print,check_sphericity) S3method(print,check_symmetry) S3method(print,check_zi) diff --git a/R/check_residuals.R b/R/check_residuals.R index c0968fb28..86d918777 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -30,7 +30,7 @@ check_residuals <- function(x, ...) { #' @export check_residuals.default <- function(x, ...) { - insight::format_error("`check_residuals()` only works with objects returned by `simulate_residuals()` by `DHARMa::simulateResiduals()`.") # nolint + insight::format_error("`check_residuals()` only works with objects returned by `simulate_residuals()` or `DHARMa::simulateResiduals()`.") # nolint } #' @rdname check_residuals @@ -39,12 +39,22 @@ check_residuals.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { alternative <- match.arg(alternative) - stats::ks.test( - stats::residuals(simulated_residuals), - "punif", - alternative = alternative, - ... + ts <- suppressWarnings( + stats::ks.test( + stats::residuals(simulated_residuals), + "punif", + alternative = alternative, + ... + ) ) + + p.val <- ts$p.value + + attr(p.val, "data") <- x + attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + class(p.val) <- unique(c("check_residuals", "see_check_residuals", class(p.val))) + + p.val } #' @export @@ -53,4 +63,25 @@ check_residuals.DHARMa <- check_residuals.performance_simres # methods ------------------------------ -# TODO: Add print method +#' @export +print.check_residuals <- function(x, ...) { + pstring <- insight::format_p(x) + + if (x < 0.05) { + insight::print_color( + sprintf( + "Warning: Non-uniformity of simulated residuals detected (%s).\n", pstring + ), + "red" + ) + } else { + insight::print_color( + sprintf( + "OK: Simulated residuals appear as uniformly distributed (%s).\n", pstring + ), + "green" + ) + } + + invisible(x) +} From 08eb9e05b6de3be3f8b374cd666555d26ce239ad Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 14 Mar 2024 08:16:59 +0100 Subject: [PATCH 22/73] fix --- R/check_residuals.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_residuals.R b/R/check_residuals.R index 86d918777..0dcfc9e56 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -41,7 +41,7 @@ check_residuals.performance_simres <- function(x, alternative <- match.arg(alternative) ts <- suppressWarnings( stats::ks.test( - stats::residuals(simulated_residuals), + stats::residuals(x), "punif", alternative = alternative, ... From 0533558e1dce05255c556870f5585de2e6fa03a3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 14 Mar 2024 08:20:34 +0100 Subject: [PATCH 23/73] test --- tests/testthat/test-check_residuals.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 tests/testthat/test-check_residuals.R diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R new file mode 100644 index 000000000..df92e3505 --- /dev/null +++ b/tests/testthat/test-check_residuals.R @@ -0,0 +1,14 @@ +test_that("check_singularity, lme4", { + skip_on_cran() + skip_if_not_installed("DHARMa") + set.seed(123) + dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) + m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) + res <- simulate_residuals(m) + out <- check_residuals(res) + expect_equal(out, 0.01884602, ignore_attr = TRUE, tolerance = 1e-4) + expect_identical( + capture.output(print(out)), + "Warning: Non-uniformity of simulated residuals detected (p = 0.019)." + ) +}) From 87d9036dff224d32c6bfe30904fd2c417c635355 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 11:49:30 +0100 Subject: [PATCH 24/73] docs --- NAMESPACE | 1 + R/check_residuals.R | 8 +++++++- R/simulate_residuals.R | 12 +++++++++--- man/check_residuals.Rd | 2 +- man/simulate_residuals.Rd | 12 +++++++++--- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4204331b3..5ea1259b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -264,6 +264,7 @@ S3method(plot,check_model) S3method(plot,check_normality) S3method(plot,check_outliers) S3method(plot,check_overdisp) +S3method(plot,check_residuals) S3method(plot,check_sphericity) S3method(plot,compare_performance) S3method(plot,performance_pp_check) diff --git a/R/check_residuals.R b/R/check_residuals.R index 0dcfc9e56..137d414d3 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -9,7 +9,7 @@ #' [`DHARMa::simulateResiduals()`]. #' @param alternative A character string specifying the alternative hypothesis. #' See [`stats::ks.test()`] for details. -#' @param ... Passed down to [`stats::ks.test()`] +#' @param ... Passed down to [`stats::ks.test()`]. #' #' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. #' @@ -85,3 +85,9 @@ print.check_residuals <- function(x, ...) { invisible(x) } + +#' @export +plot.check_residuals <- function(x, ...) { + insight::check_if_installed("see", "for residual plots") + NextMethod() +} diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index ffbd33b3d..51aa60146 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -1,18 +1,24 @@ #' @title Simulate randomized quantile residuals from a model #' @name simulate_residuals #' -#' @description to do. +#' @description Returns simulated residuals from a model. This is useful for +#' checking the uniformity of residuals, in particular for non-Gaussian models, +#' where the residuals are not expected to be normally distributed. #' #' @param x A model object. #' @param iterations Number of simulations to run. #' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. #' #' @return Simulated residuals, which can be further processed with -#' [`check_residuals()`]. +#' [`check_residuals()`]. The returned object is of class `DHARMa` and +#' `performance_simres`. #' #' @seealso [`check_residuals()`] #' -#' @details Based on [`DHARMa::simulateResiduals()`]. See also `vignette("DHARMa")`. +#' @details This function is a small wrapper around [`DHARMa::simulateResiduals()`]. +#' It basically only sets `plot = FALSE` and adds an additional class attribute +#' (`"performance_sim_res"`), which allows using the DHARMa object in own plotting +#' functions in the **see** package. See also `vignette("DHARMa")`. #' #' @references #' diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index 412615ef3..ab487074a 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -13,7 +13,7 @@ check_residuals(x, ...) \item{x}{An object returned by \code{\link[=simulate_residuals]{simulate_residuals()}} or \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} -\item{...}{Passed down to \code{\link[stats:ks.test]{stats::ks.test()}}} +\item{...}{Passed down to \code{\link[stats:ks.test]{stats::ks.test()}}.} \item{alternative}{A character string specifying the alternative hypothesis. See \code{\link[stats:ks.test]{stats::ks.test()}} for details.} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index f032aabd9..3a51be7cc 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -15,13 +15,19 @@ simulate_residuals(x, iterations = 250, ...) } \value{ Simulated residuals, which can be further processed with -\code{\link[=check_residuals]{check_residuals()}}. +\code{\link[=check_residuals]{check_residuals()}}. The returned object is of class \code{DHARMa} and +\code{performance_simres}. } \description{ -to do. +Returns simulated residuals from a model. This is useful for +checking the uniformity of residuals, in particular for non-Gaussian models, +where the residuals are not expected to be normally distributed. } \details{ -Based on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. See also \code{vignette("DHARMa")}. +This function is a small wrapper around \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. +It basically only sets \code{plot = FALSE} and adds an additional class attribute +(\code{"performance_sim_res"}), which allows using the DHARMa object in own plotting +functions in the \strong{see} package. See also \code{vignette("DHARMa")}. } \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From aea57eb59e11a3e5805bc03e5cca52a22a23983f Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 11:50:14 +0100 Subject: [PATCH 25/73] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 752b1364f..7263302b7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.9.6 +Version: 0.10.9.7 Authors@R: c(person(given = "Daniel", family = "Lüdecke", From 4cf45bae3c50843946c3aa249d96b0b1537797bf Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 11:51:15 +0100 Subject: [PATCH 26/73] news --- NEWS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/NEWS.md b/NEWS.md index 9e2f2daa8..e9a457c09 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,11 @@ * Rudimentary support for models of class `serp` from package *serp*. +## New functions + +* `simulate_residuals()` and `check_residuals()`, to simulate and check residuals + from generalized linear (mixed) models. + ## General * Improved error messages for `check_model()` when QQ-plots cannot be created. From b9454a2f6948f9d7402babce79ab0a9fa91610f2 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 13:14:09 +0100 Subject: [PATCH 27/73] include DHARMa plot --- R/check_model.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/check_model.R b/R/check_model.R index e015bff80..1757c32e2 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -367,7 +367,9 @@ check_model.model_fit <- function(x, dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) + # old QQ plots - now replaced by DHARma + # dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) + dat$QQ <- simulate_residuals(model) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$OUTLIERS <- .safe(check_outliers(model, method = "cook")) From b1281741fd2d299462b8bf9beb875f84dcf39439 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 14:48:53 +0100 Subject: [PATCH 28/73] docs --- R/check_normality.R | 4 ++-- R/check_residuals.R | 1 + R/simulate_residuals.R | 3 ++- man/check_residuals.Rd | 1 + man/simulate_residuals.Rd | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/check_normality.R b/R/check_normality.R index 9dc00d03f..fe2fbf28f 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -58,7 +58,7 @@ check_normality.default <- function(x, ...) { if (!insight::model_info(x)$is_linear) { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." # nolint ) return(NULL) } @@ -87,7 +87,7 @@ check_normality.glm <- function(x, ...) { insight::format_alert( "There's no formal statistical test for normality for generalized linear model.", - "Please use `plot()` on the return value of this function: `plot(check_normality(model))`" + "Instead, please use `simulate_residuals()` and `check_residuals()` to check for uniformity of residuals." ) invisible(out) } diff --git a/R/check_residuals.R b/R/check_residuals.R index 137d414d3..3bb202e21 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -12,6 +12,7 @@ #' @param ... Passed down to [`stats::ks.test()`]. #' #' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +#' There is a `plot()` method to visualize the distribution of the residuals. #' #' @seealso [`simulate_residuals()`] #' diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 51aa60146..f9847e4db 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -18,7 +18,8 @@ #' @details This function is a small wrapper around [`DHARMa::simulateResiduals()`]. #' It basically only sets `plot = FALSE` and adds an additional class attribute #' (`"performance_sim_res"`), which allows using the DHARMa object in own plotting -#' functions in the **see** package. See also `vignette("DHARMa")`. +#' functions in the **see** package. See also `vignette("DHARMa")`. There is a +#' `plot()` method to visualize the distribution of the residuals. #' #' @references #' diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index ab487074a..d8c0b1ca2 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -29,6 +29,7 @@ residual spatial and temporal autocorrelation. } \details{ Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +There is a \code{plot()} method to visualize the distribution of the residuals. } \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 3a51be7cc..134adc3cc 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -27,7 +27,8 @@ where the residuals are not expected to be normally distributed. This function is a small wrapper around \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. It basically only sets \code{plot = FALSE} and adds an additional class attribute (\code{"performance_sim_res"}), which allows using the DHARMa object in own plotting -functions in the \strong{see} package. See also \code{vignette("DHARMa")}. +functions in the \strong{see} package. See also \code{vignette("DHARMa")}. There is a +\code{plot()} method to visualize the distribution of the residuals. } \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From 85817fef27625b7d3052fbbb4f5b8899d99086ed Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 14:55:50 +0100 Subject: [PATCH 29/73] detrend default to FALSE --- R/check_model.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/check_model.R b/R/check_model.R index 1757c32e2..af8c5d6a4 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -189,6 +189,9 @@ check_model.default <- function(x, } else if (minfo$is_linear) { suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) } else { + if (missing(detrend)) { + detrend <- FALSE + } suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) }, error = function(e) { From 764dcdf5bb067c6db22a9d2a74774a7df7a67efb Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 15:30:28 +0100 Subject: [PATCH 30/73] allow to directly pass models --- R/check_residuals.R | 14 ++++++++------ man/check_residuals.Rd | 4 ++-- tests/testthat/test-check_residuals.R | 10 ++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/R/check_residuals.R b/R/check_residuals.R index 3bb202e21..628c5cfd0 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -29,16 +29,18 @@ check_residuals <- function(x, ...) { UseMethod("check_residuals") } +#' @rdname check_residuals #' @export -check_residuals.default <- function(x, ...) { - insight::format_error("`check_residuals()` only works with objects returned by `simulate_residuals()` or `DHARMa::simulateResiduals()`.") # nolint +check_residuals.default <- function(x, alternative = c("two.sided", "less", "greater"), ...) { + if (insight::is_model(x)) { + check_residuals(simulate_residuals(x, ...), alternative = alternative) + } else { + insight::format_error("`check_residuals()` only works with objects supported by `simulate_residuals()` or `DHARMa::simulateResiduals()`.") # nolint + } } -#' @rdname check_residuals #' @export -check_residuals.performance_simres <- function(x, - alternative = c("two.sided", "less", "greater"), - ...) { +check_residuals.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { alternative <- match.arg(alternative) ts <- suppressWarnings( stats::ks.test( diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index d8c0b1ca2..b91615953 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/check_residuals.R \name{check_residuals} \alias{check_residuals} -\alias{check_residuals.performance_simres} +\alias{check_residuals.default} \title{Check uniformity of simulated residuals} \usage{ check_residuals(x, ...) -\method{check_residuals}{performance_simres}(x, alternative = c("two.sided", "less", "greater"), ...) +\method{check_residuals}{default}(x, alternative = c("two.sided", "less", "greater"), ...) } \arguments{ \item{x}{An object returned by \code{\link[=simulate_residuals]{simulate_residuals()}} or diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R index df92e3505..b057174d0 100644 --- a/tests/testthat/test-check_residuals.R +++ b/tests/testthat/test-check_residuals.R @@ -11,4 +11,14 @@ test_that("check_singularity, lme4", { capture.output(print(out)), "Warning: Non-uniformity of simulated residuals detected (p = 0.019)." ) + skip_if_not_installed("MASS") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + quine.nb1 <- MASS::glm.nb(x ~ mu) + set.seed(123) + result <- check_residuals(quine.nb1) + expect_equal(result, 0.000665414, tolerance = 1e-3) }) From 7d80db4e4993ca60b6c0fd808a3f377564edd1c3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 15:40:15 +0100 Subject: [PATCH 31/73] close #613 --- R/check_normality.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/check_normality.R b/R/check_normality.R index fe2fbf28f..2d80ec89e 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -181,7 +181,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { # valid model? if (!info$is_linear && effects == "fixed") { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." # nolint ) return(NULL) } @@ -217,6 +217,8 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { attr(p.val, "type") <- "random effects" attr(p.val, "re_groups") <- re_groups } + } else if (inherits(x, "glmmTMB")) { + p.val <- .check_normality(stats::residuals(x, type = "deviance"), x) } else { # check for normality of residuals p.val <- .check_normality(stats::rstudent(x), x) From 3820cb05ccfc849f8e98ab3620b421908ee51e2f Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 16:26:29 +0100 Subject: [PATCH 32/73] docs --- R/check_model.R | 89 ++++++++++++++++++++++++++++------------------ man/check_model.Rd | 38 +++++++++++++++----- 2 files changed, 85 insertions(+), 42 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index af8c5d6a4..7e30d7c80 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -12,40 +12,50 @@ #' @param panel Logical, if `TRUE`, plots are arranged as panels; else, #' single plots for each diagnostic are returned. #' @param check Character vector, indicating which checks for should be performed -#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, -#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, -#' `"binned_residuals"` or `"overdispersion"`. Note that not all check apply -#' to all type of models (see 'Details'). `"reqq"` is a QQ-plot for random -#' effects and only available for mixed models. `"ncv"` is an alias for -#' `"linearity"`, and checks for non-constant variance, i.e. for -#' heteroscedasticity, as well as the linear relationship. By default, all -#' possible checks are performed and plotted. +#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, +#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, +#' `"binned_residuals"` or `"overdispersion"`. Note that not all check apply +#' to all type of models (see 'Details'). `"reqq"` is a QQ-plot for random +#' effects and only available for mixed models. `"ncv"` is an alias for +#' `"linearity"`, and checks for non-constant variance, i.e. for +#' heteroscedasticity, as well as the linear relationship. By default, all +#' possible checks are performed and plotted. #' @param alpha,dot_alpha The alpha level of the confidence bands and dot-geoms. -#' Scalar from 0 to 1. +#' Scalar from 0 to 1. #' @param colors Character vector with color codes (hex-format). Must be of -#' length 3. First color is usually used for reference lines, second color -#' for dots, and third color for outliers or extreme values. +#' length 3. First color is usually used for reference lines, second color +#' for dots, and third color for outliers or extreme values. #' @param theme String, indicating the name of the plot-theme. Must be in the -#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). +#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). #' @param detrend Logical. Should Q-Q/P-P plots be detrended? Defaults to -#' `TRUE`. +#' `TRUE` for linear models or when `residual_type = "normal"`. Defaults to +#' `FALSE` for QQ plots based on simulated residuals (i.e. when +#' `residual_type = "simulated"`). +#' @param residual_type Character, indicating the type of residuals to be used. +#' For non-Gaussian models, the default is `"simulated"`, which uses simulated +#' residuals. These are based on [`simulate_residuals()`] and thus uses the +#' **DHARMa** package to return randomized quantile residuals. For Gaussian +#' models, the default is `"normal"`, which uses the default residuals from +#' the model. Setting `residual_type = "normal"` for non-Gaussian models will +#' use a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals. #' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set -#' to `FALSE` for models with many observations, if generating the plot is too -#' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` -#' tries to guess whether performance will be poor due to a very large model -#' and thus automatically shows or hides dots. +#' to `FALSE` for models with many observations, if generating the plot is too +#' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` +#' tries to guess whether performance will be poor due to a very large model +#' and thus automatically shows or hides dots. #' @param verbose If `FALSE` (default), suppress most warning messages. #' @param ... Arguments passed down to the individual check functions, especially -#' to `check_predictions()` and `binned_residuals()`. +#' to `check_predictions()` and `binned_residuals()`. #' @inheritParams check_predictions #' #' @return The data frame that is used for plotting. #' #' @note This function just prepares the data for plotting. To create the plots, -#' **see** needs to be installed. Furthermore, this function suppresses -#' all possible warnings. In case you observe suspicious plots, please refer -#' to the dedicated functions (like `check_collinearity()`, -#' `check_normality()` etc.) to get informative messages and warnings. +#' **see** needs to be installed. Furthermore, this function suppresses +#' all possible warnings. In case you observe suspicious plots, please refer +#' to the dedicated functions (like `check_collinearity()`, +#' `check_normality()` etc.) to get informative messages and warnings. #' #' @details For Bayesian models from packages **rstanarm** or **brms**, #' models will be "converted" to their frequentist counterpart, using @@ -103,10 +113,18 @@ #' normally distributed. Usually, dots should fall along the line. If there is #' some deviation (mostly at the tails), this indicates that the model doesn't #' predict the outcome well for that range that shows larger deviations from -#' the line. For generalized linear models, a half-normal Q-Q plot of the -#' absolute value of the standardized deviance residuals is shown, however, the -#' interpretation of the plot remains the same. See [`check_normality()`] for -#' further details. +#' the line. For generalized linear models and when `residual_type = "normal"`, +#' a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals is shown, however, the interpretation of the plot remains the same. +#' See [`check_normality()`] for further details. +#' +#' @section Uniformity of Residuals: +#' Fore non-Gaussian models, when `residual_type = "simulated"` (the default +#' for generalized linear (mixed) models), residuals are not expected to be +#' normally distributed. In this case, the created Q-Q plot checks the uniformity +#' of residuals. The interpretation of the plot is the same as for the normal +#' Q-Q plot. See [`simulate_residuals()`] and [`check_residuals()`] for further +#' details. #' #' @section Overdispersion: #' For count models, an *overdispersion plot* is shown. Overdispersion occurs @@ -124,12 +142,12 @@ #' inside the error bounds. See [`binned_residuals()`] for further details. #' #' @section Residuals for (Generalized) Linear Models: -#' Plots that check the normality of residuals (QQ-plot) or the homogeneity of +#' Plots that check the normality of residuals (Q-Q plot) or the homogeneity of #' variance use standardized Pearson's residuals for generalized linear models, #' and standardized residuals for linear models. The plots for the normality of #' residuals (with overlayed normal curve) and for the linearity assumption use -#' the default residuals for `lm` and `glm` (which are deviance -#' residuals for `glm`). +#' the default residuals for `lm` and `glm` (which are deviance residuals for +#' `glm`). #' #' @section Troubleshooting: #' For models with many observations, or for more complex models in general, @@ -174,6 +192,7 @@ check_model.default <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = "simulated", verbose = FALSE, ...) { # check model formula @@ -192,7 +211,7 @@ check_model.default <- function(x, if (missing(detrend)) { detrend <- FALSE } - suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) + suppressWarnings(.check_assumptions_glm(x, minfo, residual_type, verbose, ...)) }, error = function(e) { e @@ -366,13 +385,15 @@ check_model.model_fit <- function(x, # compile plots for checks of generalized linear models ------------------------ -.check_assumptions_glm <- function(model, model_info, verbose = TRUE, ...) { +.check_assumptions_glm <- function(model, model_info, residual_type = "simulated", verbose = TRUE, ...) { dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - # old QQ plots - now replaced by DHARma - # dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) - dat$QQ <- simulate_residuals(model) + dat$QQ <- switch( + residual_type, + simulated = simulate_residuals(model), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$OUTLIERS <- .safe(check_outliers(model, method = "cook")) diff --git a/man/check_model.Rd b/man/check_model.Rd index 4e8b5fddf..ea0a390fc 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -21,6 +21,7 @@ check_model(x, ...) show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = "simulated", verbose = FALSE, ... ) @@ -57,7 +58,9 @@ for dots, and third color for outliers or extreme values.} format \code{"package::theme_name"} (e.g. \code{"ggplot2::theme_minimal"}).} \item{detrend}{Logical. Should Q-Q/P-P plots be detrended? Defaults to -\code{TRUE}.} +\code{TRUE} for linear models or when \code{residual_type = "normal"}. Defaults to +\code{FALSE} for QQ plots based on simulated residuals (i.e. when +\code{residual_type = "simulated"}).} \item{show_dots}{Logical, if \code{TRUE}, will show data points in the plot. Set to \code{FALSE} for models with many observations, if generating the plot is too @@ -76,6 +79,15 @@ to a different value.} options are appropriate for models with discrete - binary, integer or ordinal etc. - outcomes).} +\item{residual_type}{Character, indicating the type of residuals to be used. +For non-Gaussian models, the default is \code{"simulated"}, which uses simulated +residuals. These are based on \code{\link[=simulate_residuals]{simulate_residuals()}} and thus uses the +\strong{DHARMa} package to return randomized quantile residuals. For Gaussian +models, the default is \code{"normal"}, which uses the default residuals from +the model. Setting \code{residual_type = "normal"} for non-Gaussian models will +use a half-normal Q-Q plot of the absolute value of the standardized deviance +residuals.} + \item{verbose}{If \code{FALSE} (default), suppress most warning messages.} } \value{ @@ -161,10 +173,20 @@ This plot is used to determine if the residuals of the regression model are normally distributed. Usually, dots should fall along the line. If there is some deviation (mostly at the tails), this indicates that the model doesn't predict the outcome well for that range that shows larger deviations from -the line. For generalized linear models, a half-normal Q-Q plot of the -absolute value of the standardized deviance residuals is shown, however, the -interpretation of the plot remains the same. See \code{\link[=check_normality]{check_normality()}} for -further details. +the line. For generalized linear models and when \code{residual_type = "normal"}, +a half-normal Q-Q plot of the absolute value of the standardized deviance +residuals is shown, however, the interpretation of the plot remains the same. +See \code{\link[=check_normality]{check_normality()}} for further details. +} + +\section{Uniformity of Residuals}{ + +Fore non-Gaussian models, when \code{residual_type = "simulated"} (the default +for generalized linear (mixed) models), residuals are not expected to be +normally distributed. In this case, the created Q-Q plot checks the uniformity +of residuals. The interpretation of the plot is the same as for the normal +Q-Q plot. See \code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}} for further +details. } \section{Overdispersion}{ @@ -188,12 +210,12 @@ inside the error bounds. See \code{\link[=binned_residuals]{binned_residuals()}} \section{Residuals for (Generalized) Linear Models}{ -Plots that check the normality of residuals (QQ-plot) or the homogeneity of +Plots that check the normality of residuals (Q-Q plot) or the homogeneity of variance use standardized Pearson's residuals for generalized linear models, and standardized residuals for linear models. The plots for the normality of residuals (with overlayed normal curve) and for the linearity assumption use -the default residuals for \code{lm} and \code{glm} (which are deviance -residuals for \code{glm}). +the default residuals for \code{lm} and \code{glm} (which are deviance residuals for +\code{glm}). } \section{Troubleshooting}{ From d8b77f9629166eedf5709fac5e166a9327e9dcb9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 16:31:31 +0100 Subject: [PATCH 33/73] docs, args --- R/check_model.R | 27 ++++++++++++++++++++------- man/check_model.Rd | 2 +- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 7e30d7c80..20bb056f4 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -192,7 +192,7 @@ check_model.default <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", - residual_type = "simulated", + residual_type = NULL, verbose = FALSE, ...) { # check model formula @@ -202,15 +202,21 @@ check_model.default <- function(x, minfo <- insight::model_info(x, verbose = FALSE) + # set default for residual_type + if (is.null(residual_type)) { + residual_type <- ifelse(minfo$is_linear, "normal", "simulated") + } + # set default for detrend + if (missing(detrend)) { + detrend <- residual_type == "normal" + } + assumptions_data <- tryCatch( if (minfo$is_bayesian) { suppressWarnings(.check_assumptions_stan(x, ...)) } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) + suppressWarnings(.check_assumptions_linear(x, minfo, residual_type, verbose, ...)) } else { - if (missing(detrend)) { - detrend <- FALSE - } suppressWarnings(.check_assumptions_glm(x, minfo, residual_type, verbose, ...)) }, error = function(e) { @@ -294,6 +300,7 @@ check_model.stanreg <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = TRUE, ...) { check_model(bayestestR::bayesian_as_frequentist(x), @@ -309,6 +316,7 @@ check_model.stanreg <- function(x, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) @@ -333,6 +341,7 @@ check_model.model_fit <- function(x, show_dots = NULL, bandwidth = "nrd", type = "density", + residual_type = NULL, verbose = TRUE, ...) { check_model( @@ -349,6 +358,7 @@ check_model.model_fit <- function(x, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) @@ -358,11 +368,14 @@ check_model.model_fit <- function(x, # compile plots for checks of linear models ------------------------ -.check_assumptions_linear <- function(model, model_info, verbose = TRUE, ...) { +.check_assumptions_linear <- function(model, model_info, residual_type = "normal", verbose = TRUE, ...) { dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) + dat$QQ <- switch(residual_type, + simulated = simulate_residuals(model), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$NORM <- .diag_norm(model, verbose = verbose) dat$NCV <- .diag_ncv(model, verbose = verbose) diff --git a/man/check_model.Rd b/man/check_model.Rd index ea0a390fc..a117b1fba 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -21,7 +21,7 @@ check_model(x, ...) show_dots = NULL, bandwidth = "nrd", type = "density", - residual_type = "simulated", + residual_type = NULL, verbose = FALSE, ... ) From d1d4098fe1cb19e81450e402065addb79ffdb176 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 17:31:18 +0100 Subject: [PATCH 34/73] zi-inflation check based on DHARMa --- NAMESPACE | 3 ++ R/check_zeroinflation.R | 91 +++++++++++++++++++++++++++++++++++--- man/check_zeroinflation.Rd | 32 +++++++++++++- 3 files changed, 119 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5ea1259b3..757f4834d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -126,6 +126,9 @@ S3method(check_sphericity,default) S3method(check_sphericity,mlm) S3method(check_symmetry,htest) S3method(check_symmetry,numeric) +S3method(check_zeroinflation,DHARMa) +S3method(check_zeroinflation,default) +S3method(check_zeroinflation,performance_simres) S3method(cronbachs_alpha,data.frame) S3method(cronbachs_alpha,matrix) S3method(cronbachs_alpha,parameters_pca) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index f0f19b369..01c1708d7 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -19,20 +19,47 @@ #' zero-inflation in the data. In such cases, it is recommended to use #' negative binomial or zero-inflated models. #' +#' In case of negative binomial models, models with zero-inflation component, +#' or hurdle models, the results from `check_zeroinflation()` are likely to be +#' unreliable. In such cases, it is recommended to use `simulate_residuals()` +#' first, followed by `check_zeroinflation()` to check for zero-inflation, +#' e.g.: `check_zeroinflation(simulate_residuals(model))`. +#' #' @family functions to check model assumptions and and assess model quality #' -#' @examplesIf require("glmmTMB") +#' @examplesIf require("glmmTMB") && require("DHARMa") #' data(Salamanders, package = "glmmTMB") #' m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) #' check_zeroinflation(m) +#' +#' # for models with zero-inflation component, it's better to carry out +#' # the check for zero-inflation using simulated residuals +#' m <- glmmTMB::glmmTMB( +#' count ~ spp + mined, +#' ziformula = ~ mined + spp, +#' family = poisson, +#' data = Salamanders +#' ) +#' res <- simulate_residuals(m) +#' check_zeroinflation(res) +#' @export +check_zeroinflation <- function(x, ...) { + UseMethod("check_zeroinflation") +} + + +#' @rdname check_zeroinflation #' @export -check_zeroinflation <- function(x, tolerance = 0.05) { +check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { # check if we have poisson model_info <- insight::model_info(x) if (!model_info$is_count) { insight::format_error("Model must be from Poisson-family.") } + # for warning message + model_name <- insight::safe_deparse(substitute(x)) + # get actual zero of response obs.zero <- sum(insight::get_response(x, verbose = FALSE) == 0L) @@ -41,6 +68,16 @@ check_zeroinflation <- function(x, tolerance = 0.05) { return(NULL) } + # for zero-inflated models, tell user to use simulated_residuals() + if (model_info$is_zero_inflated) { + insight::format_warning(paste0( + "\nModel has zero-inflation component, returned results are likely to be unreliable. ", + "Please use `check_zeroinflation(simulate_residuals(", + model_name, + "))` to check for zero-inflation." + )) + } + # get predictions of outcome mu <- stats::fitted(x) @@ -77,6 +114,40 @@ check_zeroinflation <- function(x, tolerance = 0.05) { } +#' @rdname check_zeroinflation +#' @export +check_zeroinflation.performance_simres <- function(x, + tolerance = 0.05, + alternative = c("two.sided", "less", "greater"), + ...) { + # match arguments + alternative <- match.arg(alternative) + # count observed and simulated zeros + observed <- sum(x$observedResponse == 0) + simulated <- apply(x$simulatedResponse, 2, function(i) sum(i == 0)) + # p is simply ratio of simulated zeros to observed zeros + p <- switch( + alternative, + greater = mean(simulated >= observed), + less = mean(simulated <= observed), + min(min(mean(simulated <= observed), mean(simulated >= observed)) * 2, 1) + ) + + structure( + class = "check_zi", + list( + predicted.zeros = round(mean(simulated)), + observed.zeros = observed, + ratio = mean(simulated) / observed, + tolerance = tolerance, + p.value = p + ) + ) +} + +#' @export +check_zeroinflation.DHARMa <- check_zeroinflation.performance_simres + # methods ------------------ @@ -90,12 +161,22 @@ print.check_zi <- function(x, ...) { lower <- 1 - x$tolerance upper <- 1 + x$tolerance + if (!is.null(x$p.value)) { + p_string <- paste0(" (", insight::format_p(x$p.value), ")") + } else { + p_string <- "" + } + if (x$ratio < lower) { - message("Model is underfitting zeros (probable zero-inflation).") + message("Model is underfitting zeros (probable zero-inflation)", p_string, ".") } else if (x$ratio > upper) { - message("Model is overfitting zeros.") + message("Model is overfitting zeros", p_string, ".") } else { - insight::format_alert("Model seems ok, ratio of observed and predicted zeros is within the tolerance range.") + insight::format_alert(paste0( + "Model seems ok, ratio of observed and predicted zeros is within the tolerance range", + p_string, + "." + )) } invisible(x) diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index db9eddd23..c37b3b713 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -2,9 +2,20 @@ % Please edit documentation in R/check_zeroinflation.R \name{check_zeroinflation} \alias{check_zeroinflation} +\alias{check_zeroinflation.default} +\alias{check_zeroinflation.performance_simres} \title{Check for zero-inflation in count models} \usage{ -check_zeroinflation(x, tolerance = 0.05) +check_zeroinflation(x, ...) + +\method{check_zeroinflation}{default}(x, tolerance = 0.05, ...) + +\method{check_zeroinflation}{performance_simres}( + x, + tolerance = 0.05, + alternative = c("two.sided", "less", "greater"), + ... +) } \arguments{ \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} @@ -28,12 +39,29 @@ If the amount of observed zeros is larger than the amount of predicted zeros, the model is underfitting zeros, which indicates a zero-inflation in the data. In such cases, it is recommended to use negative binomial or zero-inflated models. + +In case of negative binomial models, models with zero-inflation component, +or hurdle models, the results from \code{check_zeroinflation()} are likely to be +unreliable. In such cases, it is recommended to use \code{simulate_residuals()} +first, followed by \code{check_zeroinflation()} to check for zero-inflation, +e.g.: \code{check_zeroinflation(simulate_residuals(model))}. } \examples{ -\dontshow{if (require("glmmTMB")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(Salamanders, package = "glmmTMB") m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) check_zeroinflation(m) + +# for models with zero-inflation component, it's better to carry out +# the check for zero-inflation using simulated residuals +m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ mined + spp, + family = poisson, + data = Salamanders +) +res <- simulate_residuals(m) +check_zeroinflation(res) \dontshow{\}) # examplesIf} } \seealso{ From 15a660fbe490f1ea149de94bca91ab714269be6d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 18:11:00 +0100 Subject: [PATCH 35/73] Fixes #367 --- R/check_zeroinflation.R | 17 +++-- man/check_zeroinflation.Rd | 2 +- tests/testthat/test-check_zeroinflation.R | 84 ++++++++++++++++++++++- 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index 01c1708d7..d8e42fbe0 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -68,14 +68,13 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { return(NULL) } - # for zero-inflated models, tell user to use simulated_residuals() - if (model_info$is_zero_inflated) { - insight::format_warning(paste0( - "\nModel has zero-inflation component, returned results are likely to be unreliable. ", - "Please use `check_zeroinflation(simulate_residuals(", - model_name, - "))` to check for zero-inflation." - )) + # for glmmTMB models with zero-inflation component or nbinom families, + # we use simulated_residuals() + if (inherits(x, "glmmTMB") && (model_info$is_zero_inflated || model_info$is_negbin)) { + if (missing(tolerance)) { + tolerance <- 0.1 + } + return(check_zeroinflation(simulate_residuals(x, ...), tolerance = tolerance, ...)) } # get predictions of outcome @@ -117,7 +116,7 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { #' @rdname check_zeroinflation #' @export check_zeroinflation.performance_simres <- function(x, - tolerance = 0.05, + tolerance = 0.1, alternative = c("two.sided", "less", "greater"), ...) { # match arguments diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index c37b3b713..724e2be00 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -12,7 +12,7 @@ check_zeroinflation(x, ...) \method{check_zeroinflation}{performance_simres}( x, - tolerance = 0.05, + tolerance = 0.1, alternative = c("two.sided", "less", "greater"), ... ) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index d2e60f065..da7f652b6 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -19,7 +19,58 @@ test_that("check_zeroinflation", { ) }) + +test_that("check_zeroinflation, glmmTMB with and without zero-inflation component", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + set.seed(123) + data(Salamanders, package = "glmmTMB") + + # no zero-inflation model + m <- glmmTMB::glmmTMB(count ~ spp + mined, family = poisson, data = Salamanders) + + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 298, + observed.zeros = 387L, + ratio = 0.770025839793282, + tolerance = 0.05 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) + + # zero-inflation model + m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + + set.seed(123) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 387, + observed.zeros = 387L, + ratio = 1.00093023255814, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) + + test_that("check_zeroinflation, glmer.nb", { + skip_on_cran() skip_if_not_installed("glmmTMB") skip_if_not_installed("lme4") set.seed(101) @@ -34,9 +85,9 @@ test_that("check_zeroinflation, glmer.nb", { mu <- 5 * (-4 + with(dd, as.integer(f1) + 4 * as.numeric(f2))) dd$y <- rnbinom(nrow(dd), mu = mu, size = 0.5) dat2 <<- dd - suppressMessages( + suppressMessages({ m <- lme4::glmer.nb(y ~ f1 * f2 + (1 | g), data = dat2, verbose = FALSE) - ) + }) expect_equal( check_zeroinflation(m), @@ -50,3 +101,32 @@ test_that("check_zeroinflation, glmer.nb", { tolerance = 1e-3 ) }) + + +test_that("check_zeroinflation, glmmTMB nbinom", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_on_cran() + + set.seed(1234) + dat <- DHARMa::createData(sampleSize = 1000) + fit <- suppressWarnings(glmmTMB::glmmTMB( + observedResponse ~ Environment1 + (1 | group), + data = dat, + family = glmmTMB::nbinom1() + )) + expect_equal( + check_zeroinflation(fit), + structure( + list( + predicted.zeros = 462, + observed.zeros = 482L, + ratio = 0.95850622406639, + tolerance = 0.1, + p.value = 0.776 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) From 5842c1534cd01b71ed65a99352431a133feceb1d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 18:32:01 +0100 Subject: [PATCH 36/73] fixes --- R/check_residuals.R | 4 ++-- R/check_zeroinflation.R | 14 ++++++++++---- man/check_zeroinflation.Rd | 10 +++++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/R/check_residuals.R b/R/check_residuals.R index 628c5cfd0..b04328f25 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -42,7 +42,7 @@ check_residuals.default <- function(x, alternative = c("two.sided", "less", "gre #' @export check_residuals.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { alternative <- match.arg(alternative) - ts <- suppressWarnings( + ts_test <- suppressWarnings( stats::ks.test( stats::residuals(x), "punif", @@ -51,7 +51,7 @@ check_residuals.performance_simres <- function(x, alternative = c("two.sided", " ) ) - p.val <- ts$p.value + p.val <- ts_test$p.value attr(p.val, "data") <- x attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index d8e42fbe0..e392137d1 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -7,9 +7,13 @@ #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` #' (package **MASS**). #' @param tolerance The tolerance for the ratio of observed and predicted -#' zeros to considered as over- or underfitting zeros. A ratio -#' between 1 +/- `tolerance` is considered as OK, while a ratio -#' beyond or below this threshold would indicate over- or underfitting. +#' zeros to considered as over- or underfitting zeros. A ratio +#' between 1 +/- `tolerance` is considered as OK, while a ratio +#' beyond or below this threshold would indicate over- or underfitting. +#' @param alternative A character string specifying the alternative hypothesis. +#' @param ... Arguments passed down to [`simulate_residuals()`]. This only applies +#' for models with zero-inflation component, or for models of class `glmmTMB` +#' from `nbinom1` or `nbinom2` family. #' #' @return A list with information about the amount of predicted and observed #' zeros in the outcome, as well as the ratio between these two values. @@ -23,7 +27,9 @@ #' or hurdle models, the results from `check_zeroinflation()` are likely to be #' unreliable. In such cases, it is recommended to use `simulate_residuals()` #' first, followed by `check_zeroinflation()` to check for zero-inflation, -#' e.g.: `check_zeroinflation(simulate_residuals(model))`. +#' e.g.: `check_zeroinflation(simulate_residuals(model))`. Usually, such models +#' are detected automatically and `check_zeroinflation()` internally calls +#' `simulate_residuals()` if necessary. #' #' @family functions to check model assumptions and and assess model quality #' diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index 724e2be00..32a2c2143 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -21,10 +21,16 @@ check_zeroinflation(x, ...) \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} (package \strong{MASS}).} +\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies +for models with zero-inflation component, or for models of class \code{glmmTMB} +from \code{nbinom1} or \code{nbinom2} family.} + \item{tolerance}{The tolerance for the ratio of observed and predicted zeros to considered as over- or underfitting zeros. A ratio between 1 +/- \code{tolerance} is considered as OK, while a ratio beyond or below this threshold would indicate over- or underfitting.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A list with information about the amount of predicted and observed @@ -44,7 +50,9 @@ In case of negative binomial models, models with zero-inflation component, or hurdle models, the results from \code{check_zeroinflation()} are likely to be unreliable. In such cases, it is recommended to use \code{simulate_residuals()} first, followed by \code{check_zeroinflation()} to check for zero-inflation, -e.g.: \code{check_zeroinflation(simulate_residuals(model))}. +e.g.: \code{check_zeroinflation(simulate_residuals(model))}. Usually, such models +are detected automatically and \code{check_zeroinflation()} internally calls +\code{simulate_residuals()} if necessary. } \examples{ \dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} From 3d0663eeff4b0e048dd7961fe8e462341519de82 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 19:12:26 +0100 Subject: [PATCH 37/73] check_overdispersion method --- NAMESPACE | 2 + R/check_overdispersion.R | 100 +++++++++++++++++++++++++++--------- R/check_zeroinflation.R | 24 +++------ R/simulate_residuals.R | 17 ++++++ man/check_overdispersion.Rd | 10 +++- 5 files changed, 110 insertions(+), 43 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 757f4834d..d4b36a3fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -92,6 +92,7 @@ S3method(check_outliers,rma.uni) S3method(check_outliers,rq) S3method(check_outliers,rqs) S3method(check_outliers,rqss) +S3method(check_overdispersion,DHARMa) S3method(check_overdispersion,default) S3method(check_overdispersion,fixest) S3method(check_overdispersion,fixest_multi) @@ -103,6 +104,7 @@ S3method(check_overdispersion,model_fit) S3method(check_overdispersion,negbin) S3method(check_overdispersion,negbinirr) S3method(check_overdispersion,negbinmfx) +S3method(check_overdispersion,performance_simres) S3method(check_overdispersion,poissonirr) S3method(check_overdispersion,poissonmfx) S3method(check_predictions,BFBayesFactor) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 95ad62c0a..7edd91da6 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -5,7 +5,7 @@ #' models for overdispersion. #' #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` -#' (package **MASS**). +#' (package **MASS**), or an object returned by `simulate_residuals()`. #' @param ... Currently not used. #' #' @return A list with results from the overdispersion test, like chi-squared @@ -113,7 +113,9 @@ print.check_overdisp <- function(x, digits = 3, ...) { orig_x <- x x$dispersion_ratio <- sprintf("%.*f", digits, x$dispersion_ratio) - x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + if (!is.null(x$chisq_statistic)) { + x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + } x$p_value <- pval <- round(x$p_value, digits = digits) if (x$p_value < 0.001) x$p_value <- "< 0.001" @@ -125,9 +127,14 @@ print.check_overdisp <- function(x, digits = 3, ...) { ) insight::print_color("# Overdispersion test\n\n", "blue") - cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) - cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) - cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + if (is.null(x$chisq_statistic)) { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } else { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } if (pval > 0.05) { message("No overdispersion detected.") @@ -147,18 +154,24 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { # check if we have poisson info <- insight::model_info(x) if (!info$is_count && !info$is_binomial) { - insight::format_error( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." - ) + insight::format_error(paste0( + "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1. ", + "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", + "`check_overdispersion(simulate_residuals(", model_name, "))`." + )) } # check for Bernoulli if (info$is_bernoulli) { - insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") + insight::format_error(paste0( + "Overdispersion checks cannot be used for Bernoulli models. ", + "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", + "`check_overdispersion(simulate_residuals(", model_name, "))`." + )) } if (info$is_binomial) { - return(check_overdispersion.merMod(x, verbose = verbose, ...)) + return(check_overdispersion.merMod(x, ...)) } yhat <- stats::fitted(x) @@ -219,33 +232,37 @@ check_overdispersion.model_fit <- check_overdispersion.poissonmfx # Overdispersion for mixed models --------------------------- #' @export -check_overdispersion.merMod <- function(x, verbose = TRUE, ...) { +check_overdispersion.merMod <- function(x, ...) { + # for warning message + model_name <- insight::safe_deparse(substitute(x)) + # check if we have poisson or binomial info <- insight::model_info(x) if (!info$is_count && !info$is_binomial) { - insight::format_error( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." - ) + insight::format_error(paste0( + "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1. ", + "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", + "`check_overdispersion(simulate_residuals(", model_name, "))`." + )) } # check for Bernoulli if (info$is_bernoulli) { - insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") + insight::format_error(paste0( + "Overdispersion checks cannot be used for Bernoulli models. ", + "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", + "`check_overdispersion(simulate_residuals(", model_name, "))`." + )) } rdf <- stats::df.residual(x) rp <- insight::get_residuals(x, type = "pearson") if (insight::is_empty_object(rp)) { - Pearson.chisq <- NA - prat <- NA - pval <- NA - rp <- NA - if (isTRUE(verbose)) { - insight::format_alert( - "Cannot test for overdispersion, because pearson residuals are not implemented for models with zero-inflation or variable dispersion.", - "Only the visual inspection using `plot(check_overdispersion(model))` is possible." - ) - } + insight::format_error(paste0( + "Cannot test for overdispersion, because pearson residuals are not implemented for models with zero-inflation or variable dispersion. ", # nolint + "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", + "`check_overdispersion(simulate_residuals(", model_name, "))`." + )) } else { Pearson.chisq <- sum(rp^2) prat <- Pearson.chisq / rdf @@ -270,3 +287,36 @@ check_overdispersion.negbin <- check_overdispersion.merMod #' @export check_overdispersion.glmmTMB <- check_overdispersion.merMod + + +# simulated residuals ----------------------------- + +#' @rdname check_overdispersion +#' @export +check_overdispersion.performance_simres <- function(x, + tolerance = 0.1, + alternative = c("two.sided", "less", "greater"), + ...) { + # match arguments + alternative <- match.arg(alternative) + + # statistics function + variance <- stats::sd(x$simulatedResponse)^2 + dispersion <- function(i) var(i - x$fittedPredictedResponse) / variance + + # compute test results + .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) + + out <- list( + dispersion_ratio = .simres_statistics$observed / mean(.simres_statistics$simulated), + p_value = .simres_statistics$p + ) + + class(out) <- c("check_overdisp", "see_check_overdisp") + attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + + out +} + +#' @export +check_overdispersion.DHARMa <- check_overdispersion.performance_simres diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index e392137d1..c9068419a 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -63,9 +63,6 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { insight::format_error("Model must be from Poisson-family.") } - # for warning message - model_name <- insight::safe_deparse(substitute(x)) - # get actual zero of response obs.zero <- sum(insight::get_response(x, verbose = FALSE) == 0L) @@ -127,25 +124,18 @@ check_zeroinflation.performance_simres <- function(x, ...) { # match arguments alternative <- match.arg(alternative) - # count observed and simulated zeros - observed <- sum(x$observedResponse == 0) - simulated <- apply(x$simulatedResponse, 2, function(i) sum(i == 0)) - # p is simply ratio of simulated zeros to observed zeros - p <- switch( - alternative, - greater = mean(simulated >= observed), - less = mean(simulated <= observed), - min(min(mean(simulated <= observed), mean(simulated >= observed)) * 2, 1) - ) + + # compute test results + .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative) structure( class = "check_zi", list( - predicted.zeros = round(mean(simulated)), - observed.zeros = observed, - ratio = mean(simulated) / observed, + predicted.zeros = round(mean(.simres_statistics$simulated)), + observed.zeros = .simres_statistics$observed, + ratio = mean(.simres_statistics$simulated) / .simres_statistics$observed, tolerance = tolerance, - p.value = p + p.value = .simres_statistics$p ) ) } diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index f9847e4db..164310d87 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -50,6 +50,7 @@ simulate_residuals <- function(x, iterations = 250, ...) { out } + # methods ------------------------------ #' @export @@ -72,3 +73,19 @@ plot.performance_simres <- function(x, ...) { insight::check_if_installed("see", "for residual plots") NextMethod() } + + +# helper functions --------------------- + +.simres_statistics <- function(x, statistic_fun, alternative = "two.sided") { + # count observed and simulated zeros + observed <- statistic_fun(x$observedResponse) + simulated <- apply(x$simulatedResponse, 2, statistic_fun) + # p is simply ratio of simulated zeros to observed zeros + p <- switch(alternative, + greater = mean(simulated >= observed), + less = mean(simulated <= observed), + min(min(mean(simulated <= observed), mean(simulated >= observed)) * 2, 1) + ) + list(observed = observed, simulated = simulated, p = p) +} diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index ce8341dc4..02098c520 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -2,13 +2,21 @@ % Please edit documentation in R/check_overdispersion.R \name{check_overdispersion} \alias{check_overdispersion} +\alias{check_overdispersion.performance_simres} \title{Check overdispersion of GL(M)M's} \usage{ check_overdispersion(x, ...) + +\method{check_overdispersion}{performance_simres}( + x, + tolerance = 0.1, + alternative = c("two.sided", "less", "greater"), + ... +) } \arguments{ \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} -(package \strong{MASS}).} +(package \strong{MASS}), or an object returned by \code{simulate_residuals()}.} \item{...}{Currently not used.} } From 5d68703aa27b431c0f9638bdf8a0727460b9e188 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 19:14:15 +0100 Subject: [PATCH 38/73] docs --- R/check_overdispersion.R | 4 +++- man/check_overdispersion.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 7edd91da6..49f192d32 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -33,7 +33,9 @@ #' section *How can I deal with overdispersion in GLMMs?*. Note that this #' function only returns an *approximate* estimate of an overdispersion #' parameter, and is probably inaccurate for zero-inflated mixed models (fitted -#' with `glmmTMB`). +#' with `glmmTMB`). In such cases, it is recommended to use `simulate_residuals()` +#' first, followed by `check_overdispersion()` to check for overdispersion, e.g.: +#' `check_overdispersion(simulate_residuals(model))`. #' #' @section How to fix Overdispersion: #' Overdispersion can be fixed by either modeling the dispersion parameter, or diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 02098c520..1be4bd61f 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -56,7 +56,9 @@ is based on the code in the section \emph{How can I deal with overdispersion in GLMMs?}. Note that this function only returns an \emph{approximate} estimate of an overdispersion parameter, and is probably inaccurate for zero-inflated mixed models (fitted -with \code{glmmTMB}). +with \code{glmmTMB}). In such cases, it is recommended to use \code{simulate_residuals()} +first, followed by \code{check_overdispersion()} to check for overdispersion, e.g.: +\code{check_overdispersion(simulate_residuals(model))}. } \section{How to fix Overdispersion}{ From 9fa5420569dc52f980c13faefc5973960cb032f7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 23:00:57 +0100 Subject: [PATCH 39/73] add test function --- R/check_model_diagnostics.R | 76 ++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index ff9482934..e3ec99ce1 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -293,7 +293,80 @@ # prepare data for homogeneity of variance plot ---------------------------------- -.diag_overdispersion <- function(model) { +.new_diag_overdispersion <- function(model, ...) { + faminfo <- insight::model_info(model) + + d <- data.frame(Predicted = stats::predict(model, type = "response")) + + # residuals based on simulated residuals - but we want normally distributed residuals + d$Residuals <- stats::residuals(simulate_residuals(model, ...), quantileFunction = stats::qnorm, ...) + d$Res2 <- d$Residuals^2 + # standard residuals: residuals / sqrt(mse) + d$StdRes <- d$Residuals / sqrt(mean(d$Res2, na.rm = TRUE)) + + # data for poisson models + if (faminfo$is_poisson && !faminfo$is_zero_inflated) { + d$V <- d$Predicted + } + + # data for negative binomial models + if (faminfo$is_negbin && !faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + if (faminfo$family == "nbinom1") { + # for nbinom1, we can use "sigma()" + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(d$Predicted) + } else { + # for nbinom2, "sigma()" has "inverse meaning" (see #654) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(d$Predicted) + } + } else { + ## FIXME: this is not correct for glm.nb models? + d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) + } + } + + # data for zero-inflated poisson models + if (faminfo$is_poisson && faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$V <- d$Predicted * (1 - d$Prob) * (1 + d$Predicted * d$Prob) + } + + # data for zero-inflated negative binomial models + if (faminfo$is_negbin && faminfo$is_zero_inflated && !faminfo$is_dispersion) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- insight::get_sigma(model) + d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint + } + + # data for zero-inflated negative binomial models with dispersion + if (faminfo$is_negbin && faminfo$is_zero_inflated && faminfo$is_dispersion) { + d <- data.frame(Predicted = stats::predict(model, type = "response")) + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- stats::predict(model, type = "disp") + d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint + } + + d +} + + + +.diag_overdispersion <- function(model, ...) { faminfo <- insight::model_info(model) # data for poisson models @@ -380,7 +453,6 @@ } - # helpers ---------------------------------- .sigma_glmmTMB_nonmixed <- function(model, faminfo) { From a25ecac41b148bb7dae65834a585d9dc246b8e89 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 23:05:19 +0100 Subject: [PATCH 40/73] fix --- R/check_zeroinflation.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index c9068419a..e92d25bcd 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -126,16 +126,16 @@ check_zeroinflation.performance_simres <- function(x, alternative <- match.arg(alternative) # compute test results - .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative) + result <- .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative) structure( class = "check_zi", list( - predicted.zeros = round(mean(.simres_statistics$simulated)), - observed.zeros = .simres_statistics$observed, - ratio = mean(.simres_statistics$simulated) / .simres_statistics$observed, + predicted.zeros = round(mean(result$simulated)), + observed.zeros = result$observed, + ratio = mean(result$simulated) / result$observed, tolerance = tolerance, - p.value = .simres_statistics$p + p.value = result$p ) ) } From d0822bab65ea738349d543d84c52464b92b22257 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 15 Mar 2024 23:06:00 +0100 Subject: [PATCH 41/73] fix --- R/check_overdispersion.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 49f192d32..f8707f5ae 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -307,11 +307,11 @@ check_overdispersion.performance_simres <- function(x, dispersion <- function(i) var(i - x$fittedPredictedResponse) / variance # compute test results - .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) + result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) out <- list( - dispersion_ratio = .simres_statistics$observed / mean(.simres_statistics$simulated), - p_value = .simres_statistics$p + dispersion_ratio = result$observed / mean(result$simulated), + p_value = result$p ) class(out) <- c("check_overdisp", "see_check_overdisp") From 7111028813b1d8fda9eaad571935505701bfb992 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 10:15:57 +0100 Subject: [PATCH 42/73] fix issues --- R/check_model.R | 4 ++-- R/check_overdispersion.R | 13 +++++++------ R/simulate_residuals.R | 4 +++- inst/WORDLIST | 7 +++++++ man/check_overdispersion.Rd | 13 ++++++------- tests/testthat/test-check_model.R | 7 ++----- tests/testthat/test-check_residuals.R | 2 ++ 7 files changed, 29 insertions(+), 21 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 20bb056f4..582d3c54a 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -373,7 +373,7 @@ check_model.model_fit <- function(x, dat$VIF <- .diag_vif(model, verbose = verbose) dat$QQ <- switch(residual_type, - simulated = simulate_residuals(model), + simulated = simulate_residuals(model, ...), .diag_qq(model, model_info = model_info, verbose = verbose) ) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) @@ -404,7 +404,7 @@ check_model.model_fit <- function(x, dat$VIF <- .diag_vif(model, verbose = verbose) dat$QQ <- switch( residual_type, - simulated = simulate_residuals(model), + simulated = simulate_residuals(model, ...), .diag_qq(model, model_info = model_info, verbose = verbose) ) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index f8707f5ae..5bb0d0c0f 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -6,7 +6,8 @@ #' #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` #' (package **MASS**), or an object returned by `simulate_residuals()`. -#' @param ... Currently not used. +#' +#' @inheritParams check_zeroinflation #' #' @return A list with results from the overdispersion test, like chi-squared #' statistics, p-value or dispersion ratio. @@ -153,6 +154,9 @@ print.check_overdisp <- function(x, digits = 3, ...) { #' @export check_overdispersion.glm <- function(x, verbose = TRUE, ...) { + # for warning message + model_name <- insight::safe_deparse(substitute(x)) + # check if we have poisson info <- insight::model_info(x) if (!info$is_count && !info$is_binomial) { @@ -295,16 +299,13 @@ check_overdispersion.glmmTMB <- check_overdispersion.merMod #' @rdname check_overdispersion #' @export -check_overdispersion.performance_simres <- function(x, - tolerance = 0.1, - alternative = c("two.sided", "less", "greater"), - ...) { +check_overdispersion.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { # match arguments alternative <- match.arg(alternative) # statistics function variance <- stats::sd(x$simulatedResponse)^2 - dispersion <- function(i) var(i - x$fittedPredictedResponse) / variance + dispersion <- function(i) stats::var(i - x$fittedPredictedResponse) / variance # compute test results result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 164310d87..1e3b2b84d 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -40,7 +40,9 @@ simulate_residuals <- function(x, iterations = 250, ...) { # TODO (low priority): Note that DHARMa::simulateResiduals(x, ...) does its own checks for whether # or not the model passed to it is supported, do we want to use this or do our # own checks so we can supply our own error message? - # + if (iterations < 2) { + insight::format_error("`iterations` must be at least 2.") + } # It's important to preserve this object as is, rather than prematurely # extracting the residuals from it because the object contains important stuff # in it that we'll want to pass onto other functions later, such as passing diff --git a/inst/WORDLIST b/inst/WORDLIST index 377bd8ea4..be923aed4 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -46,6 +46,7 @@ DOI Datenerhebung Delacre Deskriptivstatistische +DHARMa Distinguishability Dom Dominicy @@ -76,6 +77,7 @@ Gazen Gelman Gnanadesikan Guilford +Hartig HDI HJ Hastie @@ -124,6 +126,7 @@ Ley Leys Lillo Liu +Lohse Lomax MADs MSA @@ -182,6 +185,7 @@ Sensivity Shachar Shinichi Skrondal +Smyth Solomons Somers Specifity @@ -279,6 +283,7 @@ metafor mfx mhurdle mis +misspecification mlm mlogit modelfit @@ -302,6 +307,7 @@ quartile quartiles rOpenSci recoding +reimplement rempsyc reproducibility rescaling @@ -325,6 +331,7 @@ unadjusted und underfitted underfitting +underdispersion visualisation winsorization winsorize diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 1be4bd61f..d5c80a6f1 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -7,18 +7,17 @@ \usage{ check_overdispersion(x, ...) -\method{check_overdispersion}{performance_simres}( - x, - tolerance = 0.1, - alternative = c("two.sided", "less", "greater"), - ... -) +\method{check_overdispersion}{performance_simres}(x, alternative = c("two.sided", "less", "greater"), ...) } \arguments{ \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} (package \strong{MASS}), or an object returned by \code{simulate_residuals()}.} -\item{...}{Currently not used.} +\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies +for models with zero-inflation component, or for models of class \code{glmmTMB} +from \code{nbinom1} or \code{nbinom2} family.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A list with results from the overdispersion test, like chi-squared diff --git a/tests/testthat/test-check_model.R b/tests/testthat/test-check_model.R index 06973756f..b008658dc 100644 --- a/tests/testthat/test-check_model.R +++ b/tests/testthat/test-check_model.R @@ -64,11 +64,8 @@ test_that("`check_model()` warnings for tweedie", { )) expect_message( expect_message( - expect_message( - check_model(m, iterations = 1, verbose = TRUE), - regex = "Not enough model terms" - ), - regex = "QQ plot could not" + check_model(m, iterations = 2, verbose = TRUE), + regex = "Not enough model terms" ) ) }) diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R index b057174d0..6c64159d8 100644 --- a/tests/testthat/test-check_residuals.R +++ b/tests/testthat/test-check_residuals.R @@ -11,6 +11,8 @@ test_that("check_singularity, lme4", { capture.output(print(out)), "Warning: Non-uniformity of simulated residuals detected (p = 0.019)." ) + expect_error(simulate_residuals(m, iterations = 1), "`iterations` must be") + skip_if_not_installed("MASS") set.seed(3) mu <- rpois(500, lambda = 3) From 6d7ee1cbcde6a2e625407119176f94a6c0bc432e Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 10:35:14 +0100 Subject: [PATCH 43/73] lintr, styler, fix test --- R/check_model.R | 3 +-- R/check_normality.R | 8 ++++---- R/check_zeroinflation.R | 12 ++++++------ tests/testthat/test-check_residuals.R | 2 +- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 582d3c54a..8a3fa29ff 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -402,8 +402,7 @@ check_model.model_fit <- function(x, dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- switch( - residual_type, + dat$QQ <- switch(residual_type, simulated = simulate_residuals(model, ...), .diag_qq(model, model_info = model_info, verbose = verbose) ) diff --git a/R/check_normality.R b/R/check_normality.R index 2d80ec89e..2b9f071d5 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -200,7 +200,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { } }, error = function(e) { - return(NULL) + NULL } ) @@ -262,7 +262,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov # helper --------------------- .check_normality <- function(x, model, type = "residuals") { - ts <- .safe({ + ts_result <- .safe({ if (length(x) >= 5000) { suppressWarnings(stats::ks.test(x, y = "pnorm", alternative = "two.sided")) } else { @@ -270,7 +270,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov } }) - if (is.null(ts)) { + if (is.null(ts_result)) { insight::print_color( sprintf("`check_normality()` does not support models of class `%s`.\n", class(model)[1]), "red" @@ -278,7 +278,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov return(NULL) } - out <- ts$p.value + out <- ts_result$p.value attr(out, "type") <- type out diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index e92d25bcd..d029e02c2 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -97,10 +97,10 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { } # get predicted zero-counts - if (!is.null(theta)) { - pred.zero <- round(sum(stats::dnbinom(x = 0, size = theta, mu = mu))) - } else { + if (is.null(theta)) { pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) + } else { + pred.zero <- round(sum(stats::dnbinom(x = 0, size = theta, mu = mu))) } # proportion @@ -156,10 +156,10 @@ print.check_zi <- function(x, ...) { lower <- 1 - x$tolerance upper <- 1 + x$tolerance - if (!is.null(x$p.value)) { - p_string <- paste0(" (", insight::format_p(x$p.value), ")") - } else { + if (is.null(x$p.value)) { p_string <- "" + } else { + p_string <- paste0(" (", insight::format_p(x$p.value), ")") } if (x$ratio < lower) { diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R index 6c64159d8..33407abf9 100644 --- a/tests/testthat/test-check_residuals.R +++ b/tests/testthat/test-check_residuals.R @@ -22,5 +22,5 @@ test_that("check_singularity, lme4", { quine.nb1 <- MASS::glm.nb(x ~ mu) set.seed(123) result <- check_residuals(quine.nb1) - expect_equal(result, 0.000665414, tolerance = 1e-3) + expect_equal(result, 0.000665414, tolerance = 1e-3, ignore_attr = TRUE) }) From 3575e6a844a02e21dcc48773aade696734bc4eb4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 11:15:31 +0100 Subject: [PATCH 44/73] fix ZI --- R/check_overdispersion.R | 80 ++++++++-------------- R/check_zeroinflation.R | 33 ++------- man/check_overdispersion.Rd | 14 ++-- man/check_zeroinflation.Rd | 9 +-- tests/testthat/test-check_overdispersion.R | 64 +++++++++++++++++ tests/testthat/test-check_zeroinflation.R | 7 +- 6 files changed, 115 insertions(+), 92 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 5bb0d0c0f..9c3b33e99 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -27,16 +27,20 @@ #' For Poisson models, the overdispersion test is based on the code from #' _Gelman and Hill (2007), page 115_. #' +#' @section Overdispersion in Negative Binomial or Zero-Inflated Models: +#' For negative binomial (mixed) models or models with zero-inflation component, +#' the overdispersion test is based simulated residuals (see [`simulate_residuals()`]). +#' #' @section Overdispersion in Mixed Models: #' For `merMod`- and `glmmTMB`-objects, `check_overdispersion()` #' is based on the code in the #' [GLMM FAQ](http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html), #' section *How can I deal with overdispersion in GLMMs?*. Note that this #' function only returns an *approximate* estimate of an overdispersion -#' parameter, and is probably inaccurate for zero-inflated mixed models (fitted -#' with `glmmTMB`). In such cases, it is recommended to use `simulate_residuals()` -#' first, followed by `check_overdispersion()` to check for overdispersion, e.g.: -#' `check_overdispersion(simulate_residuals(model))`. +#' parameter. Using this approach would be inaccurate for zero-inflated or +#' negative binomial mixed models (fitted with `glmmTMB`), thus, in such cases, +#' the overdispersion test is based on [`simulate_residuals()`] (which is identical +#' to `check_overdispersion(simulate_residuals(model))`). #' #' @section How to fix Overdispersion: #' Overdispersion can be fixed by either modeling the dispersion parameter, or @@ -154,30 +158,14 @@ print.check_overdisp <- function(x, digits = 3, ...) { #' @export check_overdispersion.glm <- function(x, verbose = TRUE, ...) { - # for warning message - model_name <- insight::safe_deparse(substitute(x)) - - # check if we have poisson + # model info info <- insight::model_info(x) - if (!info$is_count && !info$is_binomial) { - insight::format_error(paste0( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1. ", - "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", - "`check_overdispersion(simulate_residuals(", model_name, "))`." - )) - } - # check for Bernoulli - if (info$is_bernoulli) { - insight::format_error(paste0( - "Overdispersion checks cannot be used for Bernoulli models. ", - "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", - "`check_overdispersion(simulate_residuals(", model_name, "))`." - )) - } + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin - if (info$is_binomial) { - return(check_overdispersion.merMod(x, ...)) + if (use_simulated) { + return(check_overdispersion(simulate_residuals(x, ...), ...)) } yhat <- stats::fitted(x) @@ -239,42 +227,28 @@ check_overdispersion.model_fit <- check_overdispersion.poissonmfx #' @export check_overdispersion.merMod <- function(x, ...) { - # for warning message - model_name <- insight::safe_deparse(substitute(x)) - - # check if we have poisson or binomial + # for certain distributions, simulated residuals are more accurate info <- insight::model_info(x) - if (!info$is_count && !info$is_binomial) { - insight::format_error(paste0( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1. ", - "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", - "`check_overdispersion(simulate_residuals(", model_name, "))`." - )) - } - # check for Bernoulli - if (info$is_bernoulli) { - insight::format_error(paste0( - "Overdispersion checks cannot be used for Bernoulli models. ", - "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", - "`check_overdispersion(simulate_residuals(", model_name, "))`." - )) + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin + + if (use_simulated) { + return(check_overdispersion(simulate_residuals(x, ...), ...)) } rdf <- stats::df.residual(x) rp <- insight::get_residuals(x, type = "pearson") + + # check if pearson residuals are available if (insight::is_empty_object(rp)) { - insight::format_error(paste0( - "Cannot test for overdispersion, because pearson residuals are not implemented for models with zero-inflation or variable dispersion. ", # nolint - "You may try to use `check_overdispersion()` on `simulated_residuals()`, e.g. ", - "`check_overdispersion(simulate_residuals(", model_name, "))`." - )) - } else { - Pearson.chisq <- sum(rp^2) - prat <- Pearson.chisq / rdf - pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + return(check_overdispersion(simulate_residuals(x, ...), ...)) } + Pearson.chisq <- sum(rp^2) + prat <- Pearson.chisq / rdf + pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + out <- list( chisq_statistic = Pearson.chisq, dispersion_ratio = prat, @@ -311,7 +285,7 @@ check_overdispersion.performance_simres <- function(x, alternative = c("two.side result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) out <- list( - dispersion_ratio = result$observed / mean(result$simulated), + dispersion_ratio = mean(result$simulated) / result$observed, p_value = result$p ) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index d029e02c2..0d7ce94a4 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -24,12 +24,9 @@ #' negative binomial or zero-inflated models. #' #' In case of negative binomial models, models with zero-inflation component, -#' or hurdle models, the results from `check_zeroinflation()` are likely to be -#' unreliable. In such cases, it is recommended to use `simulate_residuals()` -#' first, followed by `check_zeroinflation()` to check for zero-inflation, -#' e.g.: `check_zeroinflation(simulate_residuals(model))`. Usually, such models -#' are detected automatically and `check_zeroinflation()` internally calls -#' `simulate_residuals()` if necessary. +#' or hurdle models, the results from `check_zeroinflation()` are based on +#' [`simulate_residuals()`], i.e. `check_zeroinflation(simulate_residuals(model))` +#' is internally called if necessary. #' #' @family functions to check model assumptions and and assess model quality #' @@ -71,9 +68,9 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { return(NULL) } - # for glmmTMB models with zero-inflation component or nbinom families, + # for models with zero-inflation component or negative binomial families, # we use simulated_residuals() - if (inherits(x, "glmmTMB") && (model_info$is_zero_inflated || model_info$is_negbin)) { + if (model_info$is_zero_inflated || model_info$is_negbin) { if (missing(tolerance)) { tolerance <- 0.1 } @@ -82,26 +79,8 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { # get predictions of outcome mu <- stats::fitted(x) - - # get overdispersion parameters - if (model_info$is_negbin) { - if (methods::is(x, "glmmTMB")) { - theta <- stats::sigma(x) - } else if (methods::is(x, "glmerMod")) { - theta <- environment(x@resp$family$aic)[[".Theta"]] - } else { - theta <- x$theta - } - } else { - theta <- NULL - } - # get predicted zero-counts - if (is.null(theta)) { - pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) - } else { - pred.zero <- round(sum(stats::dnbinom(x = 0, size = theta, mu = mu))) - } + pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) # proportion structure( diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index d5c80a6f1..5ae8bf44c 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -47,6 +47,12 @@ For Poisson models, the overdispersion test is based on the code from \emph{Gelman and Hill (2007), page 115}. } +\section{Overdispersion in Negative Binomial or Zero-Inflated Models}{ + +For negative binomial (mixed) models or models with zero-inflation component, +the overdispersion test is based simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}). +} + \section{Overdispersion in Mixed Models}{ For \code{merMod}- and \code{glmmTMB}-objects, \code{check_overdispersion()} @@ -54,10 +60,10 @@ is based on the code in the \href{http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html}{GLMM FAQ}, section \emph{How can I deal with overdispersion in GLMMs?}. Note that this function only returns an \emph{approximate} estimate of an overdispersion -parameter, and is probably inaccurate for zero-inflated mixed models (fitted -with \code{glmmTMB}). In such cases, it is recommended to use \code{simulate_residuals()} -first, followed by \code{check_overdispersion()} to check for overdispersion, e.g.: -\code{check_overdispersion(simulate_residuals(model))}. +parameter. Using this approach would be inaccurate for zero-inflated or +negative binomial mixed models (fitted with \code{glmmTMB}), thus, in such cases, +the overdispersion test is based on \code{\link[=simulate_residuals]{simulate_residuals()}} (which is identical +to \code{check_overdispersion(simulate_residuals(model))}). } \section{How to fix Overdispersion}{ diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index 32a2c2143..c6054e986 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -47,12 +47,9 @@ zero-inflation in the data. In such cases, it is recommended to use negative binomial or zero-inflated models. In case of negative binomial models, models with zero-inflation component, -or hurdle models, the results from \code{check_zeroinflation()} are likely to be -unreliable. In such cases, it is recommended to use \code{simulate_residuals()} -first, followed by \code{check_zeroinflation()} to check for zero-inflation, -e.g.: \code{check_zeroinflation(simulate_residuals(model))}. Usually, such models -are detected automatically and \code{check_zeroinflation()} internally calls -\code{simulate_residuals()} if necessary. +or hurdle models, the results from \code{check_zeroinflation()} are based on +\code{\link[=simulate_residuals]{simulate_residuals()}}, i.e. \code{check_zeroinflation(simulate_residuals(model))} +is internally called if necessary. } \examples{ \dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index cdd36bcd0..ee3426299 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -45,3 +45,67 @@ test_that("check_overdispersion", { tolerance = 1e-3 ) }) + + +test_that("check_overdispersion, zero-inflated and negbin", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + m1 <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + m2 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = poisson, + data = Salamanders + ) + m3 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = glmmTMB::nbinom1(), + data = Salamanders + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 0.504903379544268, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m2), + structure( + list( + chisq_statistic = 1873.7105986433, + dispersion_ratio = 2.94608584692342, + residual_df = 636L, + p_value = 3.26556213101505e-122 + ), + class = c("check_overdisp", "see_check_overdisp"), + object_name = "m1" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 0.504903379544268, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index da7f652b6..5fd6a71fc 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -93,8 +93,11 @@ test_that("check_zeroinflation, glmer.nb", { check_zeroinflation(m), structure( list( - predicted.zeros = 153, observed.zeros = 155L, - ratio = 0.987096774193548, tolerance = 0.05 + predicted.zeros = 153, + observed.zeros = 155L, + ratio = 0.987329032258065, + tolerance = 0.1, + p.value = 0.944 ), class = "check_zi" ), From 13c696f0e9bb4083a12510d401248f89fdd03014 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 11:20:46 +0100 Subject: [PATCH 45/73] add tests --- tests/testthat/test-check_overdispersion.R | 28 ++++++++++++++++++++-- tests/testthat/test-check_zeroinflation.R | 27 +++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index ee3426299..9f38cc5ab 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -1,4 +1,4 @@ -test_that("check_overdispersion", { +test_that("check_overdispersion, glmmTMB-poisson", { skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") @@ -20,7 +20,7 @@ test_that("check_overdispersion", { ) }) -test_that("check_overdispersion", { +test_that("check_overdispersion, glmmTMB-poisson mixed", { skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") @@ -109,3 +109,27 @@ test_that("check_overdispersion, zero-inflated and negbin", { ignore_attr = TRUE ) }) + + +test_that("check_overdispersion, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + expect_equal( + check_overdispersion(m), + structure( + list( + dispersion_ratio = 2.44187535015136, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) +}) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index 5fd6a71fc..ef8ee3e44 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -133,3 +133,30 @@ test_that("check_zeroinflation, glmmTMB nbinom", { tolerance = 1e-3 ) }) + + +test_that("check_zeroinflation, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 178, + observed.zeros = 202L, + ratio = 0.879643564356436, + tolerance = 0.1, + p.value = 0.008 + ), + class = "check_zi" + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) +}) From ad60db15550c406e533ad037107c81d81a66590d Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 11:33:48 +0100 Subject: [PATCH 46/73] fix --- R/check_overdispersion.R | 5 ++++- R/check_zeroinflation.R | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 9c3b33e99..e09265e61 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -164,7 +164,10 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { # for certain distributions, simulated residuals are more accurate use_simulated <- info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin - if (use_simulated) { + # model classes not supported in DHARMa + not_supported <- c("fixest", "glmx") + + if (use_simulated && !inherits(x, not_supported)) { return(check_overdispersion(simulate_residuals(x, ...), ...)) } diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index 0d7ce94a4..be76f748b 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -68,9 +68,12 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { return(NULL) } + # model classes not supported in DHARMa + not_supported <- c("fixest", "glmx") + # for models with zero-inflation component or negative binomial families, # we use simulated_residuals() - if (model_info$is_zero_inflated || model_info$is_negbin) { + if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin)) { if (missing(tolerance)) { tolerance <- 0.1 } From 3d5844d580be9619382d3c4e63a53c769df7fb54 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:03:40 +0100 Subject: [PATCH 47/73] update descr --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7263302b7..f607579f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -70,9 +70,8 @@ Depends: R (>= 3.6) Imports: bayestestR (>= 0.13.2), - insight (>= 0.19.8), + insight (>= 0.19.9), datawizard (>= 0.9.1), - methods, stats, utils Suggests: From 25f2a1656a1889122bfb1b989502c566bb31be71 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:05:59 +0100 Subject: [PATCH 48/73] fix --- R/check_overdispersion.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index e09265e61..63fb2c633 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -171,6 +171,18 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { return(check_overdispersion(simulate_residuals(x, ...), ...)) } + # check if we have poisson - need this for models not supported by DHARMa + if (!info$is_count && !info$is_binomial) { + insight::format_error( + "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." + ) + } + + # check for Bernoulli + if (info$is_bernoulli) { + insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") + } + yhat <- stats::fitted(x) n <- stats::nobs(x) From e0235ed53aec9c11080fa2f86ab98ee8f7183f66 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:23:06 +0100 Subject: [PATCH 49/73] docs --- R/check_model.R | 4 +++- R/check_overdispersion.R | 2 ++ R/check_residuals.R | 2 ++ R/check_zeroinflation.R | 12 ++++++++++++ R/simulate_residuals.R | 13 +++++++++++++ man/check_model.Rd | 4 +++- man/check_overdispersion.Rd | 14 ++++++++++++++ man/check_residuals.Rd | 15 +++++++++++++++ man/check_zeroinflation.Rd | 14 ++++++++++++++ man/simulate_residuals.Rd | 15 +++++++++++++++ 10 files changed, 93 insertions(+), 2 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 8a3fa29ff..cfcdaf258 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -116,7 +116,9 @@ #' the line. For generalized linear models and when `residual_type = "normal"`, #' a half-normal Q-Q plot of the absolute value of the standardized deviance #' residuals is shown, however, the interpretation of the plot remains the same. -#' See [`check_normality()`] for further details. +#' See [`check_normality()`] for further details. Usually, for generalized linear +#' (mixed) models, a test for uniformity of residuals based on simulated residuals +#' is conducted (see next section). #' #' @section Uniformity of Residuals: #' Fore non-Gaussian models, when `residual_type = "simulated"` (the default diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 63fb2c633..ec8f480e8 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -42,6 +42,8 @@ #' the overdispersion test is based on [`simulate_residuals()`] (which is identical #' to `check_overdispersion(simulate_residuals(model))`). #' +#' @inheritSection check_zeroinflation Tests based on simulated residuals +#' #' @section How to fix Overdispersion: #' Overdispersion can be fixed by either modeling the dispersion parameter, or #' by choosing a different distributional family (like Quasi-Poisson, or diff --git a/R/check_residuals.R b/R/check_residuals.R index b04328f25..25156be85 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -14,6 +14,8 @@ #' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. #' There is a `plot()` method to visualize the distribution of the residuals. #' +#' @inheritSection simulate_residuals Tests based on simulated residuals +#' #' @seealso [`simulate_residuals()`] #' #' @return The p-value of the test statistics. diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index be76f748b..bb1582f26 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -28,6 +28,18 @@ #' [`simulate_residuals()`], i.e. `check_zeroinflation(simulate_residuals(model))` #' is internally called if necessary. #' +#' @section Tests based on simulated residuals: +#' For certain models, resp. model from certain families, tests are based on +#' [`simulated_residuals()`]. These are usually more accurate for tests than the +#' traditionally used Pearson residuals. However, when simulating from more +#' complex model, such as mixed models or models with zero-inflation, there are +#' several important considerations. Arguments specified in `...` are passed to +#' [`simulate_residuals()`], which relies on [`DHARMa::simulateResiduals()`] (and +#' therefore, arguments in `...` are passed further down to _DHARMa_). It is +#' recommended to read the 'Details' in `?DHARMa::simulateResiduals` closely to +#' understand the implications of the simulation process and which arguments +#' should be modified to get the most accurate results. +#' #' @family functions to check model assumptions and and assess model quality #' #' @examplesIf require("glmmTMB") && require("DHARMa") diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 1e3b2b84d..65eda8553 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -21,6 +21,19 @@ #' functions in the **see** package. See also `vignette("DHARMa")`. There is a #' `plot()` method to visualize the distribution of the residuals. #' +#' @section Tests based on simulated residuals: +#' For certain models, resp. model from certain families, tests like +#' [`check_zeroinflation()`] or [`check_overdispersion()`] are based on +#' `simulated_residuals()`. These are usually more accurate for such tests than +#' the traditionally used Pearson residuals. However, when simulating from more +#' complex model, such as mixed models or models with zero-inflation, there are +#' several important considerations. `simulate_residuals()` relies on +#' [`DHARMa::simulateResiduals()`], and additional arguments specified in `...` +#' are passed further down to that function. It is recommended to read the +#' 'Details' in `?DHARMa::simulateResiduals` closely to understand the +#' implications of the simulation process and which arguments should be modified +#' to get the most accurate results. +#' #' @references #' #' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical diff --git a/man/check_model.Rd b/man/check_model.Rd index a117b1fba..4bdea6ebe 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -176,7 +176,9 @@ predict the outcome well for that range that shows larger deviations from the line. For generalized linear models and when \code{residual_type = "normal"}, a half-normal Q-Q plot of the absolute value of the standardized deviance residuals is shown, however, the interpretation of the plot remains the same. -See \code{\link[=check_normality]{check_normality()}} for further details. +See \code{\link[=check_normality]{check_normality()}} for further details. Usually, for generalized linear +(mixed) models, a test for uniformity of residuals based on simulated residuals +is conducted (see next section). } \section{Uniformity of Residuals}{ diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 5ae8bf44c..8c88d44c9 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -73,6 +73,20 @@ by choosing a different distributional family (like Quasi-Poisson, or negative binomial, see \emph{Gelman and Hill (2007), pages 115-116}). } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests are based on +\code{\link[=simulated_residuals]{simulated_residuals()}}. These are usually more accurate for tests than the +traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. Arguments specified in \code{...} are passed to +\code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). It is +recommended to read the 'Details' in \code{?DHARMa::simulateResiduals} closely to +understand the implications of the simulation process and which arguments +should be modified to get the most accurate results. +} + \examples{ \dontshow{if (getRversion() >= "4.0.0" && require("glmmTMB", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index b91615953..d9579dd8a 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -31,6 +31,21 @@ residual spatial and temporal autocorrelation. Uniformity of residuals is checked using a Kolmogorov-Smirnov test. There is a \code{plot()} method to visualize the distribution of the residuals. } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +\code{simulated_residuals()}. These are usually more accurate for such tests than +the traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function. It is recommended to read the +'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. +} + \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index c6054e986..b41ab5ec5 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -51,6 +51,20 @@ or hurdle models, the results from \code{check_zeroinflation()} are based on \code{\link[=simulate_residuals]{simulate_residuals()}}, i.e. \code{check_zeroinflation(simulate_residuals(model))} is internally called if necessary. } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests are based on +\code{\link[=simulated_residuals]{simulated_residuals()}}. These are usually more accurate for tests than the +traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. Arguments specified in \code{...} are passed to +\code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). It is +recommended to read the 'Details' in \code{?DHARMa::simulateResiduals} closely to +understand the implications of the simulation process and which arguments +should be modified to get the most accurate results. +} + \examples{ \dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(Salamanders, package = "glmmTMB") diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 134adc3cc..5ead325c3 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -30,6 +30,21 @@ It basically only sets \code{plot = FALSE} and adds an additional class attribut functions in the \strong{see} package. See also \code{vignette("DHARMa")}. There is a \code{plot()} method to visualize the distribution of the residuals. } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +\code{simulated_residuals()}. These are usually more accurate for such tests than +the traditionally used Pearson residuals. However, when simulating from more +complex model, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function. It is recommended to read the +'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. +} + \examples{ \dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) From 39b3751ce845926d08acd08d8bc3084e65cf1521 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:38:15 +0100 Subject: [PATCH 50/73] closes #632 --- R/check_zeroinflation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index bb1582f26..0c4f80c0a 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -85,7 +85,7 @@ check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { # for models with zero-inflation component or negative binomial families, # we use simulated_residuals() - if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin)) { + if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin || model_info$family == "genpois")) { # nolint if (missing(tolerance)) { tolerance <- 0.1 } From 3382ecb7cb1f230e1cb81ea15da984a699aadaaa Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:39:11 +0100 Subject: [PATCH 51/73] fix genpois --- R/check_overdispersion.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index ec8f480e8..42861b95b 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -248,7 +248,7 @@ check_overdispersion.merMod <- function(x, ...) { info <- insight::model_info(x) # for certain distributions, simulated residuals are more accurate - use_simulated <- info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin + use_simulated <- info$family == "genpois" || info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin # nolint if (use_simulated) { return(check_overdispersion(simulate_residuals(x, ...), ...)) From 62d2bfecd2d2361b740e41393a9b6bdd3d6057e8 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 12:41:56 +0100 Subject: [PATCH 52/73] add tests --- tests/testthat/test-check_overdispersion.R | 26 +++++++++++++++++++ tests/testthat/test-check_zeroinflation.R | 29 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index 9f38cc5ab..a720e7cdc 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -133,3 +133,29 @@ test_that("check_overdispersion, MASS::negbin", { tolerance = 1e-4 ) }) + + +test_that("check_overdispersion, genpois", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_overdispersion(model), + structure( + list( + dispersion_ratio = 1.02883236131678, + p_value = 0.88 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index ef8ee3e44..38a5c7726 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -160,3 +160,32 @@ test_that("check_zeroinflation, MASS::negbin", { tolerance = 1e-4 ) }) + + +test_that("check_zeroinflation, genpois", { + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_zeroinflation(model), + structure( + list( + predicted.zeros = 386, + observed.zeros = 387L, + ratio = 0.997860465116279, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) From 18eed23d87dc01e81b44502960ac40733f4b7663 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 21:56:46 +0100 Subject: [PATCH 53/73] fix --- R/check_model.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_model.R b/R/check_model.R index cfcdaf258..ef4126832 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -206,7 +206,7 @@ check_model.default <- function(x, # set default for residual_type if (is.null(residual_type)) { - residual_type <- ifelse(minfo$is_linear, "normal", "simulated") + residual_type <- ifelse(minfo$is_linear && !minfo$is_gam, "normal", "simulated") } # set default for detrend if (missing(detrend)) { From 42d20d9f1635b8c0fe8ab57987f0fc6e9735e771 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 22:00:49 +0100 Subject: [PATCH 54/73] check_model for DHARMa and simres --- NAMESPACE | 2 ++ R/check_model.R | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index d4b36a3fa..f9252ee3f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,9 +55,11 @@ S3method(check_heteroscedasticity,default) S3method(check_homogeneity,afex_aov) S3method(check_homogeneity,default) S3method(check_homogeneity,htest) +S3method(check_model,DHARMa) S3method(check_model,brmsfit) S3method(check_model,default) S3method(check_model,model_fit) +S3method(check_model,performance_simres) S3method(check_model,stanreg) S3method(check_multimodal,data.frame) S3method(check_multimodal,numeric) diff --git a/R/check_model.R b/R/check_model.R index ef4126832..6be17be22 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -367,6 +367,47 @@ check_model.model_fit <- function(x, } +#' @export +check_model.performance_simres <- function(x, + dot_size = 2, + line_size = 0.8, + panel = TRUE, + check = "all", + alpha = 0.2, + dot_alpha = 0.8, + colors = c("#3aaf85", "#1b6ca8", "#cd201f"), + theme = "see::theme_lucid", + detrend = TRUE, + show_dots = NULL, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + verbose = TRUE, + ...) { + check_model( + x$fittedModel, + dot_size = dot_size, + line_size = line_size, + panel = panel, + check = check, + alpha = alpha, + dot_alpha = dot_alpha, + colors = colors, + theme = theme, + detrend = detrend, + show_dots = show_dots, + bandwidth = bandwidth, + type = type, + residual_type = "simulated", + verbose = verbose, + ... + ) +} + +#' @export +check_model.DHARMa <- check_model.performance_simres + + # compile plots for checks of linear models ------------------------ From 22cb2a680e7730ad6774555ffa182b0e027471b1 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 22:05:11 +0100 Subject: [PATCH 55/73] docs --- R/check_zeroinflation.R | 11 +++++++---- R/simulate_residuals.R | 11 +++++++---- man/check_overdispersion.Rd | 11 +++++++---- man/check_residuals.Rd | 11 +++++++---- man/check_zeroinflation.Rd | 11 +++++++---- man/simulate_residuals.Rd | 11 +++++++---- 6 files changed, 42 insertions(+), 24 deletions(-) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index 0c4f80c0a..63badc5d4 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -35,10 +35,13 @@ #' complex model, such as mixed models or models with zero-inflation, there are #' several important considerations. Arguments specified in `...` are passed to #' [`simulate_residuals()`], which relies on [`DHARMa::simulateResiduals()`] (and -#' therefore, arguments in `...` are passed further down to _DHARMa_). It is -#' recommended to read the 'Details' in `?DHARMa::simulateResiduals` closely to -#' understand the implications of the simulation process and which arguments -#' should be modified to get the most accurate results. +#' therefore, arguments in `...` are passed further down to _DHARMa_). The +#' defaults in DHARMa are set on the most conservative option that works for +#' all models. However, in many cases, the help advises to use different settings +#' in particular situations or for particular models. It is recommended to read +#' the 'Details' in `?DHARMa::simulateResiduals` closely to understand the +#' implications of the simulation process and which arguments should be modified +#' to get the most accurate results. #' #' @family functions to check model assumptions and and assess model quality #' diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 65eda8553..0dee1e045 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -29,10 +29,13 @@ #' complex model, such as mixed models or models with zero-inflation, there are #' several important considerations. `simulate_residuals()` relies on #' [`DHARMa::simulateResiduals()`], and additional arguments specified in `...` -#' are passed further down to that function. It is recommended to read the -#' 'Details' in `?DHARMa::simulateResiduals` closely to understand the -#' implications of the simulation process and which arguments should be modified -#' to get the most accurate results. +#' are passed further down to that function. The defaults in DHARMa are set on +#' the most conservative option that works for all models. However, in many +#' cases, the help advises to use different settings in particular situations +#' or for particular models. It is recommended to read the 'Details' in +#' `?DHARMa::simulateResiduals` closely to understand the implications of the +#' simulation process and which arguments should be modified to get the most +#' accurate results. #' #' @references #' diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 8c88d44c9..762ede07f 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -81,10 +81,13 @@ traditionally used Pearson residuals. However, when simulating from more complex model, such as mixed models or models with zero-inflation, there are several important considerations. Arguments specified in \code{...} are passed to \code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and -therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). It is -recommended to read the 'Details' in \code{?DHARMa::simulateResiduals} closely to -understand the implications of the simulation process and which arguments -should be modified to get the most accurate results. +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). The +defaults in DHARMa are set on the most conservative option that works for +all models. However, in many cases, the help advises to use different settings +in particular situations or for particular models. It is recommended to read +the 'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. } \examples{ diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index d9579dd8a..cf4295d81 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -40,10 +40,13 @@ the traditionally used Pearson residuals. However, when simulating from more complex model, such as mixed models or models with zero-inflation, there are several important considerations. \code{simulate_residuals()} relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} -are passed further down to that function. It is recommended to read the -'Details' in \code{?DHARMa::simulateResiduals} closely to understand the -implications of the simulation process and which arguments should be modified -to get the most accurate results. +are passed further down to that function. The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. } \examples{ diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index b41ab5ec5..9de6c1f5c 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -59,10 +59,13 @@ traditionally used Pearson residuals. However, when simulating from more complex model, such as mixed models or models with zero-inflation, there are several important considerations. Arguments specified in \code{...} are passed to \code{\link[=simulate_residuals]{simulate_residuals()}}, which relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and -therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). It is -recommended to read the 'Details' in \code{?DHARMa::simulateResiduals} closely to -understand the implications of the simulation process and which arguments -should be modified to get the most accurate results. +therefore, arguments in \code{...} are passed further down to \emph{DHARMa}). The +defaults in DHARMa are set on the most conservative option that works for +all models. However, in many cases, the help advises to use different settings +in particular situations or for particular models. It is recommended to read +the 'Details' in \code{?DHARMa::simulateResiduals} closely to understand the +implications of the simulation process and which arguments should be modified +to get the most accurate results. } \examples{ diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index 5ead325c3..cf8042697 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -39,10 +39,13 @@ the traditionally used Pearson residuals. However, when simulating from more complex model, such as mixed models or models with zero-inflation, there are several important considerations. \code{simulate_residuals()} relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} -are passed further down to that function. It is recommended to read the -'Details' in \code{?DHARMa::simulateResiduals} closely to understand the -implications of the simulation process and which arguments should be modified -to get the most accurate results. +are passed further down to that function. The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. } \examples{ From d9e22e85164c1f162d77a2887bf9de2a635a4f62 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 22:13:06 +0100 Subject: [PATCH 56/73] tweak --- R/check_model_diagnostics.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index e3ec99ce1..33a5cd478 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -297,16 +297,18 @@ faminfo <- insight::model_info(model) d <- data.frame(Predicted = stats::predict(model, type = "response")) + simres <- simulate_residuals(model, ...) + predicted <- simres$fittedPredictedResponse # residuals based on simulated residuals - but we want normally distributed residuals - d$Residuals <- stats::residuals(simulate_residuals(model, ...), quantileFunction = stats::qnorm, ...) + d$Residuals <- stats::residuals(simres, quantileFunction = stats::qnorm, ...) d$Res2 <- d$Residuals^2 # standard residuals: residuals / sqrt(mse) d$StdRes <- d$Residuals / sqrt(mean(d$Res2, na.rm = TRUE)) # data for poisson models if (faminfo$is_poisson && !faminfo$is_zero_inflated) { - d$V <- d$Predicted + d$V <- predicted } # data for negative binomial models @@ -314,14 +316,14 @@ if (inherits(model, "glmmTMB")) { if (faminfo$family == "nbinom1") { # for nbinom1, we can use "sigma()" - d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(d$Predicted) + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(predicted) } else { # for nbinom2, "sigma()" has "inverse meaning" (see #654) - d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(d$Predicted) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(predicted) } } else { ## FIXME: this is not correct for glm.nb models? - d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) + d$V <- predicted * (1 + predicted / insight::get_sigma(model)) } } @@ -333,7 +335,7 @@ ptype <- "zero" } d$Prob <- stats::predict(model, type = ptype) - d$V <- d$Predicted * (1 - d$Prob) * (1 + d$Predicted * d$Prob) + d$V <- predicted * (1 - d$Prob) * (1 + predicted * d$Prob) } # data for zero-inflated negative binomial models @@ -345,7 +347,7 @@ } d$Prob <- stats::predict(model, type = ptype) d$Disp <- insight::get_sigma(model) - d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint } # data for zero-inflated negative binomial models with dispersion @@ -358,7 +360,7 @@ } d$Prob <- stats::predict(model, type = ptype) d$Disp <- stats::predict(model, type = "disp") - d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint } d From 825d45608d52128db998e9d9a5400d29affdbba2 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 22:16:23 +0100 Subject: [PATCH 57/73] ... --- R/check_model_diagnostics.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index 33a5cd478..441ac91cc 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -296,9 +296,10 @@ .new_diag_overdispersion <- function(model, ...) { faminfo <- insight::model_info(model) - d <- data.frame(Predicted = stats::predict(model, type = "response")) simres <- simulate_residuals(model, ...) predicted <- simres$fittedPredictedResponse + # d <- data.frame(Predicted = stats::predict(model, type = "response")) + d <- data.frame(Predicted = predicted) # residuals based on simulated residuals - but we want normally distributed residuals d$Residuals <- stats::residuals(simres, quantileFunction = stats::qnorm, ...) From 4d00962a3f5ad2282dcac00cd2e6b3a716be2e30 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 16 Mar 2024 22:28:44 +0100 Subject: [PATCH 58/73] try --- R/check_model_diagnostics.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index 441ac91cc..39a3c843d 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -298,14 +298,12 @@ simres <- simulate_residuals(model, ...) predicted <- simres$fittedPredictedResponse - # d <- data.frame(Predicted = stats::predict(model, type = "response")) d <- data.frame(Predicted = predicted) # residuals based on simulated residuals - but we want normally distributed residuals d$Residuals <- stats::residuals(simres, quantileFunction = stats::qnorm, ...) d$Res2 <- d$Residuals^2 - # standard residuals: residuals / sqrt(mse) - d$StdRes <- d$Residuals / sqrt(mean(d$Res2, na.rm = TRUE)) + d$StdRes <- insight::get_residuals(model, type = "pearson") # data for poisson models if (faminfo$is_poisson && !faminfo$is_zero_inflated) { From fe1ee7665362d591dfb2e3a78f34a6370844a476 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 10:31:40 +0100 Subject: [PATCH 59/73] Fix #696 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ R/check_distribution.R | 20 ++++++++++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f607579f3..be9f19035 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.9.7 +Version: 0.10.9.8 Authors@R: c(person(given = "Daniel", family = "Lüdecke", diff --git a/NEWS.md b/NEWS.md index e9a457c09..bc3638554 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ * Improved error messages for `check_model()` when QQ-plots cannot be created. +* `check_distribution()` is more stable for possibly sparse data. + ## Bug fixes * Fixed issue in `check_normality()` for t-tests. diff --git a/R/check_distribution.R b/R/check_distribution.R index 6dc5f7481..21c012df4 100644 --- a/R/check_distribution.R +++ b/R/check_distribution.R @@ -77,7 +77,7 @@ check_distribution.default <- function(model) { } else { x <- stats::residuals(model) } - dat <- .extract_features(x) + dat <- .extract_features(x, "residuals") dist_residuals <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -88,7 +88,7 @@ check_distribution.default <- function(model) { dummy_factors = FALSE, preserve_levels = TRUE ) - dat <- .extract_features(x) + dat <- .extract_features(x, "response") dist_response <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -189,15 +189,27 @@ check_distribution.numeric <- function(model) { # utilities ----------------------------- -.extract_features <- function(x) { +.extract_features <- function(x, type = NULL) { # validation check, remove missings x <- x[!is.na(x)] + # this might fail, so we wrap in ".safe()" + map_est <- .safe(mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0"))) + + if (is.null(map_est)) { + map_est <- datawizard::distribution_mode(x) + msg <- "Could not accurately estimate the mode." + if (!is.null(type)) { + msg <- paste(msg, "Predicted distribution of the", type, "may be less accurate.") + } + insight::format_alert(msg) + } + data.frame( SD = stats::sd(x), MAD = stats::mad(x, constant = 1), Mean_Median_Distance = mean(x) - stats::median(x), - Mean_Mode_Distance = mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0")), + Mean_Mode_Distance = map_est, SD_MAD_Distance = stats::sd(x) - stats::mad(x, constant = 1), Var_Mean_Distance = stats::var(x) - mean(x), Range_SD = diff(range(x)) / stats::sd(x), From a47329603028cb2ec19006e930a15c4859db0a87 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 10:32:05 +0100 Subject: [PATCH 60/73] fix --- R/check_distribution.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_distribution.R b/R/check_distribution.R index 21c012df4..89f48263a 100644 --- a/R/check_distribution.R +++ b/R/check_distribution.R @@ -197,7 +197,7 @@ check_distribution.numeric <- function(model) { map_est <- .safe(mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0"))) if (is.null(map_est)) { - map_est <- datawizard::distribution_mode(x) + map_est <- mean(x) - datawizard::distribution_mode(x) msg <- "Could not accurately estimate the mode." if (!is.null(type)) { msg <- paste(msg, "Predicted distribution of the", type, "may be less accurate.") From 567f2c1abfa15f7b99530c8b7cae47e0debac031 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 11:25:43 +0100 Subject: [PATCH 61/73] add tests --- tests/testthat/_snaps/check_collinearity.md | 23 +++++++++++++++++ tests/testthat/test-binned_residuals.R | 4 +++ tests/testthat/test-check_autocorrelation.R | 5 ++++ tests/testthat/test-check_collinearity.R | 5 ++-- tests/testthat/test-checks.R | 28 +++++++++++++++++++-- 5 files changed, 61 insertions(+), 4 deletions(-) diff --git a/tests/testthat/_snaps/check_collinearity.md b/tests/testthat/_snaps/check_collinearity.md index 3e9aa24b7..dec439e85 100644 --- a/tests/testthat/_snaps/check_collinearity.md +++ b/tests/testthat/_snaps/check_collinearity.md @@ -12,3 +12,26 @@ P 1.00 1.00 1.00 K 1.00 1.00 1.00 +# check_collinearity, hurdle/zi models w/o zi-formula + + Code + print(out) + Output + # Check for Multicollinearity + + * conditional component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + mar 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + + * zero inflated component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + mar 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + diff --git a/tests/testthat/test-binned_residuals.R b/tests/testthat/test-binned_residuals.R index 7b966797e..e23e05d01 100644 --- a/tests/testthat/test-binned_residuals.R +++ b/tests/testthat/test-binned_residuals.R @@ -21,6 +21,10 @@ test_that("binned_residuals", { c(-0.05686, -0.12331, -0.35077, -0.57683, 0.17916, -0.44147), tolerance = 1e-4 ) + expect_identical( + capture.output(print(result)), + "Warning: Probably bad model fit. Only about 50% of the residuals are inside the error bounds." + ) }) diff --git a/tests/testthat/test-check_autocorrelation.R b/tests/testthat/test-check_autocorrelation.R index f70617565..a97b2eeaa 100644 --- a/tests/testthat/test-check_autocorrelation.R +++ b/tests/testthat/test-check_autocorrelation.R @@ -4,4 +4,9 @@ test_that("check_autocorrelation", { set.seed(123) out <- check_autocorrelation(m) expect_equal(as.vector(out), 0.316, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Residuals appear to be independent and not autocorrelated (p = 0.316)." + ) + expect_warning(plot(out), "There is currently") }) diff --git a/tests/testthat/test-check_collinearity.R b/tests/testthat/test-check_collinearity.R index ea41af513..4811c5afe 100644 --- a/tests/testthat/test-check_collinearity.R +++ b/tests/testthat/test-check_collinearity.R @@ -202,14 +202,15 @@ test_that("check_collinearity, hurdle/zi models w/o zi-formula", { link = "logit" ) out <- check_collinearity(m) - expect_identical( - colnames(out), + expect_named( + out, c( "Term", "VIF", "VIF_CI_low", "VIF_CI_high", "SE_factor", "Tolerance", "Tolerance_CI_low", "Tolerance_CI_high", "Component" ) ) expect_equal(out$VIF, c(1.05772, 1.05772, 1.06587, 1.06587), tolerance = 1e-4) + expect_snapshot(print(out)) }) test_that("check_collinearity, invalid data", { diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 46fd77dc5..34250875e 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -5,8 +5,32 @@ test_that("check_factorstructure", { expect_equal(x$sphericity$chisq, 408.011, tolerance = 0.01) }) -test_that("check_clusterstructure", { +test_that("check_clusterstructure, ok", { skip_if_not_installed("parameters") set.seed(333) - expect_equal(check_clusterstructure(iris[, 1:4])$H, 0.187, tolerance = 0.01) + out <- check_clusterstructure(iris[, 1:4]) + expect_equal(out$H, 0.187, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is suitable for clustering (Hopkins' H = 0.18)." + ) + ) +}) + +test_that("check_clusterstructure, bad", { + skip_if_not_installed("parameters") + set.seed(13) + out <- check_clusterstructure(mtcars[, 10:11]) + expect_equal(out$H, 0.5142575, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is not suitable for clustering (Hopkins' H = 0.51)." + ) + ) }) From 24faae5980004d9495a6db293017df1d24f9420d Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 11:31:00 +0100 Subject: [PATCH 62/73] add tests --- tests/testthat/test-check_convergence.R | 17 +++++++++++++++++ tests/testthat/test-check_heterogeneity_bias.R | 2 +- tests/testthat/test-check_heteroskedasticity.R | 17 +++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test-check_heteroskedasticity.R diff --git a/tests/testthat/test-check_convergence.R b/tests/testthat/test-check_convergence.R index 1663d1219..8897b785d 100644 --- a/tests/testthat/test-check_convergence.R +++ b/tests/testthat/test-check_convergence.R @@ -26,3 +26,20 @@ test_that("check_convergence", { model <- lme4::lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) expect_true(check_convergence(model)) }) + + +test_that("check_convergence, glmmTMB", { + skip_if_not_installed("glmmTMB") + data(iris) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ poly(Petal.Width, 4) * poly(Petal.Length, 4) + + (1 + poly(Petal.Width, 4) | Species), + data = iris + )) + expect_false(check_convergence(model)) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ Petal.Width + (1 | Species), + data = iris + )) + expect_true(check_convergence(model)) +}) diff --git a/tests/testthat/test-check_heterogeneity_bias.R b/tests/testthat/test-check_heterogeneity_bias.R index 7abc6af30..2bd63856e 100644 --- a/tests/testthat/test-check_heterogeneity_bias.R +++ b/tests/testthat/test-check_heterogeneity_bias.R @@ -1,7 +1,7 @@ test_that("check_heterogeneity_bias", { data(iris) set.seed(123) - iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID + iris$ID <- sample.int(4, nrow(iris), replace = TRUE) # fake-ID out <- check_heterogeneity_bias(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID") expect_equal(out, c("Sepal.Length", "Petal.Length"), ignore_attr = TRUE) expect_output(print(out), "Possible heterogeneity bias due to following predictors: Sepal\\.Length, Petal\\.Length") diff --git a/tests/testthat/test-check_heteroskedasticity.R b/tests/testthat/test-check_heteroskedasticity.R new file mode 100644 index 000000000..4d64a870b --- /dev/null +++ b/tests/testthat/test-check_heteroskedasticity.R @@ -0,0 +1,17 @@ +test_that("check_heteroskedasticity", { + data(mtcars) + m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.0423, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.042)." + ) + m <- lm(mpg ~ hp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.8271352, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Error variance appears to be homoscedastic (p = 0.827)." + ) +}) From c0940251fae0efc9add5ac1e0e7e8102ddd92343 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 11:43:50 +0100 Subject: [PATCH 63/73] fix test --- tests/testthat/test-checks.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 34250875e..b6d59af3b 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -1,21 +1,21 @@ test_that("check_factorstructure", { skip_if_not_installed("parameters") x <- check_factorstructure(mtcars) - expect_equal(x$KMO$MSA, 0.826, tolerance = 0.01) - expect_equal(x$sphericity$chisq, 408.011, tolerance = 0.01) + expect_equal(x$KMO$MSA, 0.8265536, tolerance = 0.01) + expect_equal(x$sphericity$chisq, 408.0116, tolerance = 0.01) }) test_that("check_clusterstructure, ok", { skip_if_not_installed("parameters") set.seed(333) out <- check_clusterstructure(iris[, 1:4]) - expect_equal(out$H, 0.187, tolerance = 0.01) + expect_equal(out$H, 0.1869618, tolerance = 0.01) expect_identical( capture.output(print(out)), c( "# Clustering tendency", "", - "The dataset is suitable for clustering (Hopkins' H = 0.18)." + "The dataset is suitable for clustering (Hopkins' H = 0.19)." ) ) }) From 3d78e364c6dae4fc97bab311a8bd8b6425f6fb4f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 12:53:26 +0100 Subject: [PATCH 64/73] detect over and underdispersion --- R/check_overdispersion.R | 21 ++++++---- man/check_overdispersion.Rd | 11 +++-- tests/testthat/test-check_overdispersion.R | 49 +++++++++++++++++++--- 3 files changed, 64 insertions(+), 17 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 42861b95b..2da75f4ea 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -2,7 +2,7 @@ #' @name check_overdispersion #' #' @description `check_overdispersion()` checks generalized linear (mixed) -#' models for overdispersion. +#' models for overdispersion (and underdispersion). #' #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` #' (package **MASS**), or an object returned by `simulate_residuals()`. @@ -13,15 +13,18 @@ #' statistics, p-value or dispersion ratio. #' #' @details Overdispersion occurs when the observed variance is higher than the -#' variance of a theoretical model. For Poisson models, variance increases -#' with the mean and, therefore, variance usually (roughly) equals the mean -#' value. If the variance is much higher, the data are "overdispersed". +#' variance of a theoretical model. For Poisson models, variance increases +#' with the mean and, therefore, variance usually (roughly) equals the mean +#' value. If the variance is much higher, the data are "overdispersed". A less +#' common case is underdispersion, where the variance is much lower than the +#' mean. #' #' @section Interpretation of the Dispersion Ratio: #' If the dispersion ratio is close to one, a Poisson model fits well to the #' data. Dispersion ratios larger than one indicate overdispersion, thus a -#' negative binomial model or similar might fit better to the data. A p-value < -#' .05 indicates overdispersion. +#' negative binomial model or similar might fit better to the data. Dispersion +#' ratios much smaller than one indicate underdispersion. A p-value < .05 +#' indicates either overdispersion or underdisperion (the first being more common). #' #' @section Overdispersion in Poisson Models: #' For Poisson models, the overdispersion test is based on the code from @@ -147,8 +150,10 @@ print.check_overdisp <- function(x, digits = 3, ...) { if (pval > 0.05) { message("No overdispersion detected.") - } else { + } else if (x$dispersion_ratio > 1) { message("Overdispersion detected.") + } else { + message("Underdispersion detected.") } invisible(orig_x) @@ -302,7 +307,7 @@ check_overdispersion.performance_simres <- function(x, alternative = c("two.side result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) out <- list( - dispersion_ratio = mean(result$simulated) / result$observed, + dispersion_ratio = result$observed / mean(result$simulated), p_value = result$p ) diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 762ede07f..9d1186638 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -25,20 +25,23 @@ statistics, p-value or dispersion ratio. } \description{ \code{check_overdispersion()} checks generalized linear (mixed) -models for overdispersion. +models for overdispersion (and underdispersion). } \details{ Overdispersion occurs when the observed variance is higher than the variance of a theoretical model. For Poisson models, variance increases with the mean and, therefore, variance usually (roughly) equals the mean -value. If the variance is much higher, the data are "overdispersed". +value. If the variance is much higher, the data are "overdispersed". A less +common case is underdispersion, where the variance is much lower than the +mean. } \section{Interpretation of the Dispersion Ratio}{ If the dispersion ratio is close to one, a Poisson model fits well to the data. Dispersion ratios larger than one indicate overdispersion, thus a -negative binomial model or similar might fit better to the data. A p-value < -.05 indicates overdispersion. +negative binomial model or similar might fit better to the data. Dispersion +ratios much smaller than one indicate underdispersion. A p-value < .05 +indicates either overdispersion or underdisperion (the first being more common). } \section{Overdispersion in Poisson Models}{ diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index a720e7cdc..bcb62d256 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -4,8 +4,9 @@ test_that("check_overdispersion, glmmTMB-poisson", { data(Salamanders, package = "glmmTMB") m1 <- glm(count ~ spp + mined, family = poisson, data = Salamanders) + out <- check_overdispersion(m1) expect_equal( - check_overdispersion(m1), + out, structure( list( chisq_statistic = 1873.71012423995, @@ -18,6 +19,32 @@ test_that("check_overdispersion, glmmTMB-poisson", { ), tolerance = 1e-3 ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 2.946", + " Pearson's Chi-Squared = 1873.710", + " p-value = < 0.001", + "" + ) + ) + expect_message(out, "Overdispersion detected") + + set.seed(123) + out <- check_overdispersion(simulate_residuals(m1)) + expect_equal( + out, + structure( + list( + dispersion_ratio = 3.91516791651235, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-3 + ) }) test_that("check_overdispersion, glmmTMB-poisson mixed", { @@ -73,7 +100,7 @@ test_that("check_overdispersion, zero-inflated and negbin", { check_overdispersion(m1), structure( list( - dispersion_ratio = 0.504903379544268, + dispersion_ratio = 1.98057695890769, p_value = 0 ), class = c("check_overdisp", "see_check_overdisp") @@ -100,7 +127,7 @@ test_that("check_overdispersion, zero-inflated and negbin", { check_overdispersion(m1), structure( list( - dispersion_ratio = 0.504903379544268, + dispersion_ratio = 1.98057695890769, p_value = 0 ), class = c("check_overdisp", "see_check_overdisp") @@ -120,11 +147,12 @@ test_that("check_overdispersion, MASS::negbin", { x <- ceiling(x) x <- pmax(x, 0) m <- MASS::glm.nb(x ~ mu) + out <- check_overdispersion(m) expect_equal( - check_overdispersion(m), + out, structure( list( - dispersion_ratio = 2.44187535015136, + dispersion_ratio = 0.409521313173506, p_value = 0 ), class = c("check_overdisp", "see_check_overdisp") @@ -132,6 +160,17 @@ test_that("check_overdispersion, MASS::negbin", { ignore_attr = TRUE, tolerance = 1e-4 ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 0.410", + " p-value = < 0.001", + "" + ) + ) + expect_message(out, "Underdispersion detected") }) From 37e7adca06c3517f9002bdd44751fad360c84f6c Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 12:53:59 +0100 Subject: [PATCH 65/73] docs --- R/check_overdispersion.R | 2 +- man/check_overdispersion.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 2da75f4ea..bd8311fb9 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -1,4 +1,4 @@ -#' @title Check overdispersion of GL(M)M's +#' @title Check overdispersion (and underdispersion) of GL(M)M's #' @name check_overdispersion #' #' @description `check_overdispersion()` checks generalized linear (mixed) diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index 9d1186638..d1aaea175 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -3,7 +3,7 @@ \name{check_overdispersion} \alias{check_overdispersion} \alias{check_overdispersion.performance_simres} -\title{Check overdispersion of GL(M)M's} +\title{Check overdispersion (and underdispersion) of GL(M)M's} \usage{ check_overdispersion(x, ...) From 35b5e19988386b584d91116be542baca1e98f33f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 13:01:16 +0100 Subject: [PATCH 66/73] fix Underdispersion? #263 --- R/check_overdispersion.R | 22 ++++++++++++++++------ tests/testthat/test-check_overdispersion.R | 4 ++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index bd8311fb9..9e063fec5 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -167,6 +167,7 @@ print.check_overdisp <- function(x, digits = 3, ...) { check_overdispersion.glm <- function(x, verbose = TRUE, ...) { # model info info <- insight::model_info(x) + obj_name <- insight::safe_deparse_symbol(substitute(x)) # for certain distributions, simulated residuals are more accurate use_simulated <- info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin @@ -175,7 +176,7 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { not_supported <- c("fixest", "glmx") if (use_simulated && !inherits(x, not_supported)) { - return(check_overdispersion(simulate_residuals(x, ...), ...)) + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } # check if we have poisson - need this for models not supported by DHARMa @@ -208,7 +209,7 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -251,12 +252,13 @@ check_overdispersion.model_fit <- check_overdispersion.poissonmfx check_overdispersion.merMod <- function(x, ...) { # for certain distributions, simulated residuals are more accurate info <- insight::model_info(x) + obj_name <- insight::safe_deparse_symbol(substitute(x)) # for certain distributions, simulated residuals are more accurate use_simulated <- info$family == "genpois" || info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin # nolint if (use_simulated) { - return(check_overdispersion(simulate_residuals(x, ...), ...)) + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } rdf <- stats::df.residual(x) @@ -264,7 +266,7 @@ check_overdispersion.merMod <- function(x, ...) { # check if pearson residuals are available if (insight::is_empty_object(rp)) { - return(check_overdispersion(simulate_residuals(x, ...), ...)) + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } Pearson.chisq <- sum(rp^2) @@ -279,7 +281,7 @@ check_overdispersion.merMod <- function(x, ...) { ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -299,6 +301,14 @@ check_overdispersion.performance_simres <- function(x, alternative = c("two.side # match arguments alternative <- match.arg(alternative) + # check for special arguments - we may pass "object_name" from other methods + dots <- list(...) + if (!is.null(dots$object_name)) { + obj_name <- dots$object_name + } else { + obj_name <- insight::safe_deparse_symbol(substitute(x)) + } + # statistics function variance <- stats::sd(x$simulatedResponse)^2 dispersion <- function(i) stats::var(i - x$fittedPredictedResponse) / variance @@ -312,7 +322,7 @@ check_overdispersion.performance_simres <- function(x, alternative = c("two.side ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index bcb62d256..beaffe562 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -171,6 +171,10 @@ test_that("check_overdispersion, MASS::negbin", { ) ) expect_message(out, "Underdispersion detected") + + # check that plot works + skip_if_not_installed("see") + expect_s3_class(plot(out), "ggplot") }) From 50735e3f4a1eb388c0ec69383d9e096a08804bd4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 17 Mar 2024 13:12:58 +0100 Subject: [PATCH 67/73] typo, tests --- R/check_overdispersion.R | 2 +- man/check_overdispersion.Rd | 2 +- tests/testthat/test-check_overdispersion.R | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 9e063fec5..db4873c18 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -24,7 +24,7 @@ #' data. Dispersion ratios larger than one indicate overdispersion, thus a #' negative binomial model or similar might fit better to the data. Dispersion #' ratios much smaller than one indicate underdispersion. A p-value < .05 -#' indicates either overdispersion or underdisperion (the first being more common). +#' indicates either overdispersion or underdispersion (the first being more common). #' #' @section Overdispersion in Poisson Models: #' For Poisson models, the overdispersion test is based on the code from diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index d1aaea175..19c957323 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -41,7 +41,7 @@ If the dispersion ratio is close to one, a Poisson model fits well to the data. Dispersion ratios larger than one indicate overdispersion, thus a negative binomial model or similar might fit better to the data. Dispersion ratios much smaller than one indicate underdispersion. A p-value < .05 -indicates either overdispersion or underdisperion (the first being more common). +indicates either overdispersion or underdispersion (the first being more common). } \section{Overdispersion in Poisson Models}{ diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index beaffe562..2930322bb 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -30,7 +30,7 @@ test_that("check_overdispersion, glmmTMB-poisson", { "" ) ) - expect_message(out, "Overdispersion detected") + expect_message(capture.output(print(out)), "Overdispersion detected") set.seed(123) out <- check_overdispersion(simulate_residuals(m1)) @@ -47,6 +47,7 @@ test_that("check_overdispersion, glmmTMB-poisson", { ) }) + test_that("check_overdispersion, glmmTMB-poisson mixed", { skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") @@ -170,7 +171,7 @@ test_that("check_overdispersion, MASS::negbin", { "" ) ) - expect_message(out, "Underdispersion detected") + expect_message(capture.output(print(out)), "Underdispersion detected") # check that plot works skip_if_not_installed("see") @@ -193,7 +194,7 @@ test_that("check_overdispersion, genpois", { check_overdispersion(model), structure( list( - dispersion_ratio = 1.02883236131678, + dispersion_ratio = 0.971975646955856, p_value = 0.88 ), class = c("check_overdisp", "see_check_overdisp") From 0180abda5e7748b90a3202d792be5f6b212ad1f6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 10:04:52 +0100 Subject: [PATCH 68/73] check_outliers for DHARMa --- NAMESPACE | 3 +++ NEWS.md | 20 ++++++++++++++- R/check_outliers.R | 55 ++++++++++++++++++++++++++++++++++++++++-- R/simulate_residuals.R | 14 ++++++++--- 4 files changed, 86 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f9252ee3f..24daed512 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,6 +73,7 @@ S3method(check_normality,lmerModLmerTest) S3method(check_normality,merMod) S3method(check_normality,numeric) S3method(check_outliers,BFBayesFactor) +S3method(check_outliers,DHARMa) S3method(check_outliers,character) S3method(check_outliers,data.frame) S3method(check_outliers,default) @@ -89,6 +90,7 @@ S3method(check_outliers,meta) S3method(check_outliers,metabin) S3method(check_outliers,metagen) S3method(check_outliers,numeric) +S3method(check_outliers,performance_simres) S3method(check_outliers,rma) S3method(check_outliers,rma.uni) S3method(check_outliers,rq) @@ -295,6 +297,7 @@ S3method(print,check_normality_binom) S3method(print,check_outliers) S3method(print,check_outliers_metafor) S3method(print,check_outliers_metagen) +S3method(print,check_outliers_simres) S3method(print,check_overdisp) S3method(print,check_residuals) S3method(print,check_sphericity) diff --git a/NEWS.md b/NEWS.md index bc3638554..6fb76ca53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,25 @@ ## New functions * `simulate_residuals()` and `check_residuals()`, to simulate and check residuals - from generalized linear (mixed) models. + from generalized linear (mixed) models. Simulating residuals is based on the + DHARMa package, and objects returned by `simulate_residuals()` inherit from + the `DHARMa` class, and thus can be used with any functions from the *DHARMa* + package. However, there are also implementations in the *performance* package, + such as `check_overdispersion()`, `check_zeroinflation()`, `check_outliers()` + or `check_model()`. + +* Plots for `check_model()` have been improved. The Q-Q plots are now based + on simulated residuals from the DHARMa package for non-Gaussian models, thus + providing more accurate and informative plots. The half-normal QQ plot for + generalized linear models can still be obtained by setting the new argument + `residual_type = "normal"`. + +* Following functions now support simulated residuals (from `simulate_residuals()`) + resp. objects returned from `DHARMa::simulateResiduals()`: + - `check_overdispersion()` + - `check_zeroinflation()` + - `check_outliers()` + - `check_model()` ## General diff --git a/R/check_outliers.R b/R/check_outliers.R index 2017d4cc5..96ef06f12 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -12,7 +12,8 @@ #' by at least half of the methods). See the **Details** section below #' for a description of the methods. #' -#' @param x A model or a data.frame object. +#' @param x A model, a data.frame, a `performance_simres` [`simulate_residuals()`] +#' or a `DHARMa` object. #' @param method The outlier detection method(s). Can be `"all"` or some of #' `"cook"`, `"pareto"`, `"zscore"`, `"zscore_robust"`, `"iqr"`, `"ci"`, `"eti"`, #' `"hdi"`, `"bci"`, `"mahalanobis"`, `"mahalanobis_robust"`, `"mcd"`, `"ics"`, @@ -27,7 +28,11 @@ #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, #' they are passed down to [stats::mahalanobis()]. `percentage_central` can -#' be specified when `method = "mcd"`. +#' be specified when `method = "mcd"`. For objects of class `performance_simres` +#' or `DHARMa`, further arguments are passed down to `DHARMa::testOutliers()`. +#' +#' @inheritParams check_zeroinflation +#' @inheritParams simulate_residuals #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -785,6 +790,28 @@ plot.check_outliers <- function(x, ...) { NextMethod() } +#' @export +print.check_outliers_simres <- function(x, digits = 2, ...) { + result <- paste0( + insight::format_value(100 * x$Coefficient, digits = digits, ...), + "%, ", + insight::format_ci(100 * x$CI_low, 100 * x$CI_high, digits = digits, ...) + ) + insight::print_color("# Outliers detection\n\n", "blue") + cat(sprintf(" Proportion of expected outliers: %.*f%%\n", digits, 100 * x$Expected)) + cat(sprintf(" Proportion of observed outliers: %s\n\n", result)) + + p_string <- paste0(" (", insight::format_p(x$p_value), ")") + + if (x$p_value < 0.05) { + message("Outliers were detected", p_string, ".") + } else { + message("No outliers were detected", p_string, ".") + } + + invisible(x) +} + # other classes ------------------------- @@ -1438,6 +1465,30 @@ check_outliers.meta <- check_outliers.metagen check_outliers.metabin <- check_outliers.metagen +#' @rdname check_outliers +#' @export +check_outliers.performance_simres <- function(x, type = "default", iterations = 100, alternative = "two.sided", ...) { + type <- match.arg(type, c("default", "binomial", "bootstrap")) + alternative <- match.arg(alternative, c("two.sided", "greater", "less")) + + insight::check_if_installed("DHARMa") + result <- DHARMa::testOutliers(x, type = type, nBoot = iterations, alternative = alternative, plot = FALSE, ...) + + outlier <- list( + Coefficient = as.vector(result$estimate), + Expected = as.numeric(gsub("(.*)\\(expected: (\\d.*)\\)", "\\2", names(result$estimate))), + CI_low = result$conf.int[1], + CI_high = result$conf.int[2], + p_value = result$p.value + ) + class(outlier) <- c("check_outliers_simres", class(outlier)) + outlier +} + +#' @export +check_outliers.DHARMa <- check_outliers.performance_simres + + # Thresholds -------------------------------------------------------------- diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 0dee1e045..a88d0deb9 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -96,9 +96,17 @@ plot.performance_simres <- function(x, ...) { # helper functions --------------------- .simres_statistics <- function(x, statistic_fun, alternative = "two.sided") { - # count observed and simulated zeros - observed <- statistic_fun(x$observedResponse) - simulated <- apply(x$simulatedResponse, 2, statistic_fun) + # summarize the observed and simulated residuals + if (!is.null(statistic_fun)) { + # either apply a function to observed and simulated residusls, + # to calcualte a summary statistic + observed <- statistic_fun(x$observedResponse) + simulated <- apply(x$simulatedResponse, 2, statistic_fun) + } else { + # or we pass the values to compute the p-value directly (for "check_outliers()") + observed <- x + simulated <- statistic_fun + } # p is simply ratio of simulated zeros to observed zeros p <- switch(alternative, greater = mean(simulated >= observed), From 276a847d1ab6d72796222584cc638525df79a8d0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 10:07:55 +0100 Subject: [PATCH 69/73] fix --- R/check_outliers.R | 6 +++--- man/check_outliers.Rd | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 96ef06f12..e638308aa 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -793,13 +793,13 @@ plot.check_outliers <- function(x, ...) { #' @export print.check_outliers_simres <- function(x, digits = 2, ...) { result <- paste0( - insight::format_value(100 * x$Coefficient, digits = digits, ...), + insight::format_value(100 * x$Expected, digits = digits, ...), "%, ", insight::format_ci(100 * x$CI_low, 100 * x$CI_high, digits = digits, ...) ) insight::print_color("# Outliers detection\n\n", "blue") - cat(sprintf(" Proportion of expected outliers: %.*f%%\n", digits, 100 * x$Expected)) - cat(sprintf(" Proportion of observed outliers: %s\n\n", result)) + cat(sprintf(" Proportion of observed outliers: %.*f%%\n", digits, 100 * x$Coefficient)) + cat(sprintf(" Proportion of expected outliers: %s\n\n", result)) p_string <- paste0(" (", insight::format_p(x$p_value), ")") diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 22b88228c..9987d12c5 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -5,6 +5,7 @@ \alias{check_outliers.default} \alias{check_outliers.numeric} \alias{check_outliers.data.frame} +\alias{check_outliers.performance_simres} \title{Outliers detection (check for influential observations)} \usage{ check_outliers(x, ...) @@ -21,14 +22,24 @@ check_outliers(x, ...) \method{check_outliers}{numeric}(x, method = "zscore_robust", threshold = NULL, ...) \method{check_outliers}{data.frame}(x, method = "mahalanobis", threshold = NULL, ID = NULL, ...) + +\method{check_outliers}{performance_simres}( + x, + type = "default", + iterations = 100, + alternative = "two.sided", + ... +) } \arguments{ -\item{x}{A model or a data.frame object.} +\item{x}{A model, a data.frame, a \code{performance_simres} \code{\link[=simulate_residuals]{simulate_residuals()}} +or a \code{DHARMa} object.} \item{...}{When \code{method = "ics"}, further arguments in \code{...} are passed down to \code{\link[ICSOutlier:ics.outlier]{ICSOutlier::ics.outlier()}}. When \code{method = "mahalanobis"}, they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}. \code{percentage_central} can -be specified when \code{method = "mcd"}.} +be specified when \code{method = "mcd"}. For objects of class \code{performance_simres} +or \code{DHARMa}, further arguments are passed down to \code{DHARMa::testOutliers()}.} \item{method}{The outlier detection method(s). Can be \code{"all"} or some of \code{"cook"}, \code{"pareto"}, \code{"zscore"}, \code{"zscore_robust"}, \code{"iqr"}, \code{"ci"}, \code{"eti"}, @@ -44,6 +55,10 @@ for any of the method run.} \item{ID}{Optional, to report an ID column along with the row number.} \item{verbose}{Toggle warnings.} + +\item{iterations}{Number of simulations to run.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A logical vector of the detected outliers with a nice printing From 7b7ba1c6dc3b6711520b9ed1d65cc1dcd1172a9e Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 10:40:07 +0100 Subject: [PATCH 70/73] docs --- R/check_outliers.R | 11 +++++++++++ R/check_predictions.R | 2 ++ R/check_residuals.R | 5 ++++- R/simulate_residuals.R | 2 +- man/check_outliers.Rd | 13 +++++++++++++ man/check_predictions.Rd | 2 ++ man/check_residuals.Rd | 5 ++++- man/simulate_residuals.Rd | 2 +- 8 files changed, 38 insertions(+), 4 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index e638308aa..c0f459614 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -205,6 +205,17 @@ #' observations located at `qnorm(1-0.025) * SD)` of the log-transformed #' LOF distance. Requires the **dbscan** package. #' +#' @section Methods for simulated residuals: +#' +#' The approach for detecting outliers based on simulated residuals differs +#' from the traditional methods and may not be detecting outliers as expected. +#' Literally, this approach compares observed to simulated values. However, we +#' do not know the deviation of the observed data to the model expectation, and +#' thus, the term "outlier" should be taken with a grain of salt. Basically, the +#' comparison tests whether on observed data point is outside the simulated range. +#' It is strongly recommended to read the related documentations in the **DHARMa** +#' package, e.g. `?DHARMa::testOutliers`. +#' #' @section Threshold specification: #' #' Default thresholds are currently specified as follows: diff --git a/R/check_predictions.R b/R/check_predictions.R index ac4eaf45c..a1df0c28f 100644 --- a/R/check_predictions.R +++ b/R/check_predictions.R @@ -37,6 +37,8 @@ #' #' @return A data frame of simulated responses and the original response vector. #' +#' @seealso [`simulate_residuals()`] and [`check_residuals()`]. +#' #' @details An example how posterior predictive checks can also be used for model #' comparison is Figure 6 from _Gabry et al. 2019, Figure 6_. #' diff --git a/R/check_residuals.R b/R/check_residuals.R index 25156be85..7d7d3df3d 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -13,10 +13,13 @@ #' #' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. #' There is a `plot()` method to visualize the distribution of the residuals. +#' The test fpr uniformity basically tests to which extent the observed values +#' deviate from the model expectations (i.e. simulated values). In this sense, +#' the `check_residuals()` function has similar goals like [`check_predictions()`]. #' #' @inheritSection simulate_residuals Tests based on simulated residuals #' -#' @seealso [`simulate_residuals()`] +#' @seealso [`simulate_residuals()`] and [`check_predictions()`]. #' #' @return The p-value of the test statistics. #' diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index a88d0deb9..3dbba0b5b 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -13,7 +13,7 @@ #' [`check_residuals()`]. The returned object is of class `DHARMa` and #' `performance_simres`. #' -#' @seealso [`check_residuals()`] +#' @seealso [`check_residuals()`] and [`check_predictions()`]. #' #' @details This function is a small wrapper around [`DHARMa::simulateResiduals()`]. #' It basically only sets `plot = FALSE` and adds an additional class attribute diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 9987d12c5..1e6df8bc2 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -245,6 +245,19 @@ LOF distance. Requires the \strong{dbscan} package. } } +\section{Methods for simulated residuals}{ + + +The approach for detecting outliers based on simulated residuals differs +from the traditional methods and may not be detecting outliers as expected. +Literally, this approach compares observed to simulated values. However, we +do not know the deviation of the observed data to the model expectation, and +thus, the term "outlier" should be taken with a grain of salt. Basically, the +comparison tests whether on observed data point is outside the simulated range. +It is strongly recommended to read the related documentations in the \strong{DHARMa} +package, e.g. \code{?DHARMa::testOutliers}. +} + \section{Threshold specification}{ diff --git a/man/check_predictions.Rd b/man/check_predictions.Rd index 591c813da..e64b306fb 100644 --- a/man/check_predictions.Rd +++ b/man/check_predictions.Rd @@ -117,6 +117,8 @@ Cambridge University Press. } } \seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}}. + Other functions to check model assumptions and and assess model quality: \code{\link{check_autocorrelation}()}, \code{\link{check_collinearity}()}, diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index cf4295d81..5919e1550 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -30,6 +30,9 @@ residual spatial and temporal autocorrelation. \details{ Uniformity of residuals is checked using a Kolmogorov-Smirnov test. There is a \code{plot()} method to visualize the distribution of the residuals. +The test fpr uniformity basically tests to which extent the observed values +deviate from the model expectations (i.e. simulated values). In this sense, +the \code{check_residuals()} function has similar goals like \code{\link[=check_predictions]{check_predictions()}}. } \section{Tests based on simulated residuals}{ @@ -58,5 +61,5 @@ check_residuals(res) \dontshow{\}) # examplesIf} } \seealso{ -\code{\link[=simulate_residuals]{simulate_residuals()}} +\code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_predictions]{check_predictions()}}. } diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd index cf8042697..030e69501 100644 --- a/man/simulate_residuals.Rd +++ b/man/simulate_residuals.Rd @@ -64,5 +64,5 @@ of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} } } \seealso{ -\code{\link[=check_residuals]{check_residuals()}} +\code{\link[=check_residuals]{check_residuals()}} and \code{\link[=check_predictions]{check_predictions()}}. } From a6fd86dc4f96281a692d09419482af338a83402a Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 11:16:25 +0100 Subject: [PATCH 71/73] docs --- R/check_outliers.R | 12 ++++++++---- man/check_outliers.Rd | 13 +++++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index c0f459614..a589b1309 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -24,6 +24,10 @@ #' 'Details'). If a numeric value is given, it will be used as the threshold #' for any of the method run. #' @param ID Optional, to report an ID column along with the row number. +#' @param type Type of method to test for outliers. Can be one of `"default"`, +#' `"binomial"` or `"bootstrap"`. Only applies when `x` is an object returned +#' by `simulate_residuals()` or of class `DHARMa`. See 'Details' in +#' `?DHARMa::testOutliers` for a detailed description of the types. #' @param verbose Toggle warnings. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, @@ -211,10 +215,10 @@ #' from the traditional methods and may not be detecting outliers as expected. #' Literally, this approach compares observed to simulated values. However, we #' do not know the deviation of the observed data to the model expectation, and -#' thus, the term "outlier" should be taken with a grain of salt. Basically, the -#' comparison tests whether on observed data point is outside the simulated range. -#' It is strongly recommended to read the related documentations in the **DHARMa** -#' package, e.g. `?DHARMa::testOutliers`. +#' thus, the term "outlier" should be taken with a grain of salt. It refers to +#' "simulation outliers". Basically, the comparison tests whether on observed +#' data point is outside the simulated range. It is strongly recommended to read +#' the related documentations in the **DHARMa** package, e.g. `?DHARMa::testOutliers`. #' #' @section Threshold specification: #' diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 1e6df8bc2..84a381985 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -56,6 +56,11 @@ for any of the method run.} \item{verbose}{Toggle warnings.} +\item{type}{Type of method to test for outliers. Can be one of \code{"default"}, +\code{"binomial"} or \code{"bootstrap"}. Only applies when \code{x} is an object returned +by \code{simulate_residuals()} or of class \code{DHARMa}. See 'Details' in +\code{?DHARMa::testOutliers} for a detailed description of the types.} + \item{iterations}{Number of simulations to run.} \item{alternative}{A character string specifying the alternative hypothesis.} @@ -252,10 +257,10 @@ The approach for detecting outliers based on simulated residuals differs from the traditional methods and may not be detecting outliers as expected. Literally, this approach compares observed to simulated values. However, we do not know the deviation of the observed data to the model expectation, and -thus, the term "outlier" should be taken with a grain of salt. Basically, the -comparison tests whether on observed data point is outside the simulated range. -It is strongly recommended to read the related documentations in the \strong{DHARMa} -package, e.g. \code{?DHARMa::testOutliers}. +thus, the term "outlier" should be taken with a grain of salt. It refers to +"simulation outliers". Basically, the comparison tests whether on observed +data point is outside the simulated range. It is strongly recommended to read +the related documentations in the \strong{DHARMa} package, e.g. \code{?DHARMa::testOutliers}. } \section{Threshold specification}{ From 54327de43a319882ec7a60cf7655612386f05eae Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 11:19:53 +0100 Subject: [PATCH 72/73] add tests --- tests/testthat/test-check_outliers.R | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index 8840e26bf..016c827af 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -342,3 +342,40 @@ test_that("check_outliers with invald data", { regex = "No numeric variables found" ) }) + + +test_that("check_outliers with DHARMa", { + skip_if_not_installed("DHARMa") + mt1 <- mtcars[, c(1, 3, 4)] + # create some fake outliers and attach outliers to main df + mt2 <- rbind(mt1, data.frame( + mpg = c(37, 40), disp = c(300, 400), + hp = c(110, 120) + )) + # fit model with outliers + model <- lm(disp ~ mpg + hp, data = mt2) + set.seed(123) + res <- simulate_residuals(model) + out <- check_outliers(res) + expect_equal( + out, + structure( + list( + Coefficient = 0.0294117647058824, Expected = 0.00796812749003984, + CI_low = 0.000744364234690261, CI_high = 0.153267669560318, + p_value = 0.238146844116552 + ), + class = c("check_outliers_simres", "list") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(out)), + c( + "# Outliers detection", "", " Proportion of observed outliers: 2.94%", + " Proportion of expected outliers: 0.80%, 95% CI [0.07, 15.33]", + "" + ) + ) +}) From 4127d9dcfe5385e6de1dcfe78204b419414b02e3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 18 Mar 2024 11:31:58 +0100 Subject: [PATCH 73/73] lintrs, spelling --- R/check_overdispersion.R | 6 +++--- R/check_predictions.R | 2 +- R/check_residuals.R | 2 +- R/simulate_residuals.R | 12 ++++++------ man/check_residuals.Rd | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index db4873c18..0a9cfb595 100644 --- a/R/check_overdispersion.R +++ b/R/check_overdispersion.R @@ -303,10 +303,10 @@ check_overdispersion.performance_simres <- function(x, alternative = c("two.side # check for special arguments - we may pass "object_name" from other methods dots <- list(...) - if (!is.null(dots$object_name)) { - obj_name <- dots$object_name - } else { + if (is.null(dots$object_name)) { obj_name <- insight::safe_deparse_symbol(substitute(x)) + } else { + obj_name <- dots$object_name } # statistics function diff --git a/R/check_predictions.R b/R/check_predictions.R index a1df0c28f..69222d7f8 100644 --- a/R/check_predictions.R +++ b/R/check_predictions.R @@ -106,7 +106,7 @@ check_predictions.default <- function(object, minfo <- insight::model_info(object, verbose = FALSE) # try to find sensible default for "type" argument - suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) + suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint if (missing(type) && suggest_dots) { type <- "discrete_interval" } diff --git a/R/check_residuals.R b/R/check_residuals.R index 7d7d3df3d..2bb284c88 100644 --- a/R/check_residuals.R +++ b/R/check_residuals.R @@ -13,7 +13,7 @@ #' #' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. #' There is a `plot()` method to visualize the distribution of the residuals. -#' The test fpr uniformity basically tests to which extent the observed values +#' The test for uniformity basically tests to which extent the observed values #' deviate from the model expectations (i.e. simulated values). In this sense, #' the `check_residuals()` function has similar goals like [`check_predictions()`]. #' diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R index 3dbba0b5b..207b660db 100644 --- a/R/simulate_residuals.R +++ b/R/simulate_residuals.R @@ -97,15 +97,15 @@ plot.performance_simres <- function(x, ...) { .simres_statistics <- function(x, statistic_fun, alternative = "two.sided") { # summarize the observed and simulated residuals - if (!is.null(statistic_fun)) { - # either apply a function to observed and simulated residusls, + if (is.null(statistic_fun)) { + # we pass the values to compute the p-value directly (for "check_outliers()") + observed <- x + simulated <- statistic_fun + } else { + # or apply a function to observed and simulated residusls, # to calcualte a summary statistic observed <- statistic_fun(x$observedResponse) simulated <- apply(x$simulatedResponse, 2, statistic_fun) - } else { - # or we pass the values to compute the p-value directly (for "check_outliers()") - observed <- x - simulated <- statistic_fun } # p is simply ratio of simulated zeros to observed zeros p <- switch(alternative, diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd index 5919e1550..dfb56ff83 100644 --- a/man/check_residuals.Rd +++ b/man/check_residuals.Rd @@ -30,7 +30,7 @@ residual spatial and temporal autocorrelation. \details{ Uniformity of residuals is checked using a Kolmogorov-Smirnov test. There is a \code{plot()} method to visualize the distribution of the residuals. -The test fpr uniformity basically tests to which extent the observed values +The test for uniformity basically tests to which extent the observed values deviate from the model expectations (i.e. simulated values). In this sense, the \code{check_residuals()} function has similar goals like \code{\link[=check_predictions]{check_predictions()}}. }