New Function: Anomalize

business-science · Oct 28, 2023 · 90060be · 90060be
1 parent 182e485
commit 90060be
Show file tree

Hide file tree

Showing 4 changed files with 421 additions and 12 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,8 @@ S3method(add_time,default)
 S3method(add_time,numeric)
 S3method(add_time,yearmon)
 S3method(add_time,yearqtr)
+S3method(anomalize,data.frame)
+S3method(anomalize,grouped_df)
 S3method(bake,step_box_cox)
 S3method(bake,step_diff)
 S3method(bake,step_fourier)
@@ -251,6 +253,7 @@ export("%||%")
 export(":=")
 export(.data)
 export(add_time)
+export(anomalize)
 export(as_label)
 export(as_name)
 export(auto_lambda)

diff --git a/R/00_global_vars.R b/R/00_global_vars.R
@@ -1,12 +1,14 @@
-globalVariables(c("index", "index.num", "y", "yhat", "week", "mday", "diff.median", ".",
-                  "data", "nested.col", "weekday", "locale", "holiday_name", "values", "value",
-                  "holidays", "lag", ".facets_collapsed", ".groups_consolidated", ".value_mod", ".value_smooth", "name",
-                  ".group", ".group_value", "feature", "id", "key", "splits",
-                  "observed", "remainder", "seasadj", "season", "trend", ".color_mod",
-                  ".id", ".key", ".splits", ".value", "abs_diff_lower", "abs_diff_upper", "below_max_anoms",
-                  "centerline", "direction", "limit_lower", "limit_upper", "max_abs_diff", "outlier", "outlier_reported",
-                  "sorting", "anomaly", "recomposed_l1", "recomposed_l2", "remainder_l1", "remainder_l2",
-                  ".metric", ".metrics", ".rank_metric", ".rank_std_err", "failure_rate", "std_err",
-                  ".rank_failure_rate", ".txt", "training", "testing", "fitted", "model", "..val", "..nm",
-                  ".white_noise_lower", ".white_noise_upper", ".date_var_collapsed", ".rowid", "..date_agg", "grp_names",
-                  "check_row_exists", ".box_group"))
+globalVariables(
+    c("index", "index.num", "y", "yhat", "week", "mday", "diff.median", ".",
+      "data", "nested.col", "weekday", "locale", "holiday_name", "values", "value",
+      "holidays", "lag", ".facets_collapsed", ".groups_consolidated", ".value_mod", ".value_smooth", "name",
+      ".group", ".group_value", "feature", "id", "key", "splits",
+      "observed", "remainder", "seasadj", "season", "trend", ".color_mod",
+      ".id", ".key", ".splits", ".value", "abs_diff_lower", "abs_diff_upper", "below_max_anoms",
+      "centerline", "direction", "limit_lower", "limit_upper", "max_abs_diff", "outlier", "outlier_reported",
+      "sorting", "anomaly", "recomposed_l1", "recomposed_l2", "remainder_l1", "remainder_l2",
+      ".metric", ".metrics", ".rank_metric", ".rank_std_err", "failure_rate", "std_err",
+      ".rank_failure_rate", ".txt", "training", "testing", "fitted", "model", "..val", "..nm",
+      ".white_noise_lower", ".white_noise_upper", ".date_var_collapsed", ".rowid", "..date_agg", "grp_names",
+      "check_row_exists", ".box_group", ".method")
+)
diff --git a/R/anomalize.R b/R/anomalize.R
@@ -0,0 +1,277 @@
+#' Automatic group-wise Anomaly Detection
+#'
+#' `anomalize()` is used to detect anomalies in time series data,
+#' either for a single time series or for multiple time series grouped by a specific column.
+#'
+#' @param .data A `tibble` or `data.frame` with a time-based column
+#' @param .date_var A column containing either date or date-time values
+#' @param .value A column containing numeric values
+#' @param .frequency Controls the seasonal adjustment (removal of seasonality).
+#'  Input can be either "auto", a time-based definition (e.g. "2 weeks"),
+#'  or a numeric number of observations per frequency (e.g. 10).
+#'  Refer to [tk_get_frequency()].
+#' @param .trend Controls the trend component.
+#'  For STL, trend controls the sensitivity of the LOESS smoother, which is used to remove the remainder.
+#'  Refer to [tk_get_trend()].
+#' @param .iqr_alpha Controls the width of the "normal" range. Lower values are more conservative
+#'  while higher values are less prone to incorrectly classifying "normal" observations.
+#' @param .clean_alpha Controls the threshold for cleaning
+#' the outliers. The default is 0.75, which means that the anomalies will be
+#' cleaned using the 0.75 * lower or upper bound of the recomposed time series,
+#' depending on the direction of the anomaly.
+#' @param .max_anomalies The maximum percent of anomalies permitted to be identified.
+#' @param .message A boolean. If `TRUE`, will output information related to automatic frequency
+#' and trend selection (if applicable).
+#'
+#' @return
+#' A `tibble` or `data.frame` with the following columns:
+#'  - observed: original data
+#'  - seasonal: seasonal component
+#'  - seasadaj: seasonal adjusted
+#'  - trend: trend component
+#'  - remainder: residual component
+#'  - anomaly: Yes/No flag for outlier detection
+#'  - anomaly score: distance from centerline
+#'  - anomaly direction: -1, 0, 1 inidicator for direction of the anomaly
+#'  - recomposed_l1: lower level bound of recomposed time series
+#'  - recomposed_l2: upper level bound of recomposed time series
+#'  - observed_clean: original data with anomalies interpolated
+#'
+#' @details
+#'
+#' The `anomalize()` method for anomaly detection that implements a 2-step process to
+#' detect outliers in time series.
+#'
+#' __Step 1: Detrend & Remove Seasonality using STL Decomposition__
+#'
+#' The decomposition separates the "season" and "trend" components from the "observed" values
+#' leaving the "remainder" for anomaly detection.
+#'
+#' The user can control two parameters: frequency and trend.
+#'
+#' 1. `.frequency`: Adjusts the "season" component that is removed from the "observed" values.
+#' 2. `.trend`: Adjusts the trend window (t.window parameter from [stats::stl()] that is used.
+#'
+#' The user may supply both `.frequency` and `.trend` as time-based durations (e.g. "6 weeks") or
+#' numeric values (e.g. 180) or "auto", which predetermines the frequency and/or trend based on
+#' the scale of the time series using the [tk_time_scale_template()].
+#'
+#' __Step 2: Anomaly Detection__
+#'
+#' Once "trend" and "season" (seasonality) is removed, anomaly detection is performed on the "remainder".
+#' Anomalies are identified, and boundaries (recomposed_l1 and recomposed_l2) are determined.
+#'
+#' The Anomaly Detection Method uses an inner quartile range (IQR) of +/-25 the median.
+#'
+#' _IQR Adjustment, alpha parameter_
+#'
+#' With the default `alpha = 0.05`, the limits are established by expanding
+#' the 25/75 baseline by an IQR Factor of 3 (3X).
+#' The _IQR Factor = 0.15 / alpha_ (hence 3X with alpha = 0.05):
+#'
+#' - To increase the IQR Factor controlling the limits, decrease the alpha,
+#' which makes it more difficult to be an outlier.
+#' - Increase alpha to make it easier to be an outlier.
+#'
+#'
+#' - The IQR outlier detection method is used in `forecast::tsoutliers()`.
+#' - A similar outlier detection method is used by Twitter's `AnomalyDetection` package.
+#' - Both Twitter and Forecast tsoutliers methods have been implemented in Business Science's `anomalize`
+#'  package.
+#'
+#'
+#' @references
+#' 1. CLEVELAND, R. B., CLEVELAND, W. S., MCRAE, J. E., AND TERPENNING, I.
+#'  STL: A Seasonal-Trend Decomposition Procedure Based on Loess.
+#'  Journal of Official Statistics, Vol. 6, No. 1 (1990), pp. 3-73.
+#'
+#' 2. Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014).
+#'  A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.
+#'
+#' @examples
+#' library(dplyr)
+#'
+#' walmart_sales_weekly %>%
+#'     filter(id %in% c("1_1", "1_3")) %>%
+#'     group_by(id) %>%
+#'     anomalize(Date, Weekly_Sales)
+#'
+#' @name anomalize
+#' @export
+#'
+anomalize <- function(.data, .date_var, .value,
+                      .frequency = "auto", .trend = "auto",
+                      .method = "stl",
+                      .iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
+                      .message = TRUE) {
+
+    # Tidyeval Setup
+    date_var_expr <- rlang::enquo(.date_var)
+    value_expr    <- rlang::enquo(.value)
+
+    # Checks
+    if (!is.data.frame(.data)) {
+        stop(call. = FALSE, "anomalize(.data) is not a data-frame or tibble. Please supply a data.frame or tibble.")
+    }
+    if (rlang::quo_is_missing(date_var_expr)) {
+        stop(call. = FALSE, "anomalize(.date_var) is missing. Please supply a date or date-time column.")
+    }
+    if (rlang::quo_is_missing(value_expr)) {
+        stop(call. = FALSE, "anomalize(.value) is missing. Please supply a numeric column.")
+    }
+
+    if (.method != "stl") {
+        stop(call. = FALSE, "anomalize(.method): Only 'stl' is currently implemented.")
+    }
+
+    UseMethod("anomalize", .data)
+
+}
+
+
+#' @export
+anomalize.data.frame <- function(.data, .date_var, .value,
+                                 .frequency = "auto", .trend = "auto",
+                                 .method = "stl",
+                                 .iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
+                                 .message = TRUE) {
+
+    # STL Decomposition (observed, season, trend, remainder, seasadj)
+    ret <- .data %>%
+        tk_stl_diagnostics(
+            .date_var  = !! rlang::enquo(.date_var),
+            .value     = !! rlang::enquo(.value),
+            .frequency = .frequency,
+            .trend     = .trend,
+            .message   = .message
+        )
+
+    # Detect Anomalies (remainder_l1, remainder_l2, anomaly)
+    ret <- ret %>%
+        mutate_anomalies_2(
+            target    = remainder,
+            alpha     = .iqr_alpha,
+            max_anoms = .max_anomalies
+        )
+
+    # Recomposition
+    ret <- ret %>%
+        dplyr::mutate(
+            recomposed_l1 = season + trend + remainder_l1,
+            recomposed_l2 = season + trend + remainder_l2
+        ) %>%
+        dplyr::select(-remainder_l1, -remainder_l2)
+
+
+    # Clean
+    ret <- ret %>%
+        dplyr::mutate(
+            observed_cleaned = dplyr::case_when(
+                anomaly_direction == -1 ~ .clean_alpha * recomposed_l1,
+                anomaly_direction == 1 ~ .clean_alpha * recomposed_l2,
+                TRUE ~ observed
+            )
+        )
+
+
+    return(ret)
+
+}
+
+#' @export
+anomalize.grouped_df <- function(.data, .date_var, .value,
+                                 .frequency = "auto", .trend = "auto",
+                                 .method = "stl",
+                                 .iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
+                                 .message = TRUE) {
+
+
+    # Tidy Eval Setup
+    value_expr    <- rlang::enquo(.value)
+    date_var_expr <- rlang::enquo(.date_var)
+    group_names   <- dplyr::group_vars(.data)
+
+    # Process groups individually
+    .data %>%
+        tidyr::nest() %>%
+        dplyr::mutate(nested.col = purrr::map(
+            .x         = data,
+            .f         = function(df) anomalize(
+                .data          = df,
+                .date_var      = !! date_var_expr,
+                .value         = !! value_expr,
+                .frequency     = .frequency,
+                .trend         = .trend,
+                .method        = .method,
+                .iqr_alpha     = .iqr_alpha,
+                .clean_alpha   = .clean_alpha,
+                .max_anomalies = .max_anomalies,
+                .message       = .message
+            )
+        )) %>%
+        dplyr::select(-"data") %>%
+        tidyr::unnest(cols = nested.col) %>%
+        dplyr::group_by_at(.vars = group_names)
+
+}
+
+# UTILITIES ----
+
+mutate_anomalies_2 <- function(data, target, alpha = 0.05, max_anoms = 0.20) {
+
+    # Checks
+    if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)
+
+    # Setup
+    target_expr <- rlang::enquo(target)
+    x           <- data %>% dplyr::pull(!! target_expr)
+
+    # Explicitly call functions
+    outlier_tbl <- iqr_2(x = x, alpha = alpha, max_anoms = max_anoms, verbose = TRUE)
+
+    ret <- data %>% dplyr::bind_cols(outlier_tbl)
+
+    return(ret)
+
+}
+
+
+# This is anomalize::iqr()
+iqr_2 <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) {
+
+    quantile_x <- stats::quantile(x, prob = c(0.25, 0.75), na.rm = TRUE)
+    iq_range   <- quantile_x[[2]] - quantile_x[[1]]
+    limits     <- quantile_x + (0.15 / alpha) * iq_range * c(-1, 1)
+
+    outlier_idx      <- ((x < limits[1]) | (x > limits[2]))
+
+    centerline <- sum(limits) / 2
+    score <- abs(x - centerline)
+
+    outlier_reported <- dplyr::case_when(
+        x < limits[1] ~ "Yes",
+        x > limits[2] ~ "Yes",
+        TRUE ~ "No"
+    )
+
+    direction <- dplyr::case_when(
+        x < limits[1] ~ -1,
+        x > limits[2] ~ 1,
+        TRUE ~ 0
+    )
+
+    remainder_l1 <- limits[1]
+    remainder_l2 <- limits[2]
+
+    return(
+        tibble::tibble(
+            anomaly = outlier_reported,
+            anomaly_direction = direction,
+            anomaly_score = score,
+            remainder_l1 = remainder_l1,
+            remainder_l2 = remainder_l2
+        )
+    )
+
+}
+