Skip to content

Commit

Permalink
New Function: Anomalize
Browse files Browse the repository at this point in the history
  • Loading branch information
mdancho84 committed Oct 28, 2023
1 parent 182e485 commit 90060be
Show file tree
Hide file tree
Showing 4 changed files with 421 additions and 12 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ S3method(add_time,default)
S3method(add_time,numeric)
S3method(add_time,yearmon)
S3method(add_time,yearqtr)
S3method(anomalize,data.frame)
S3method(anomalize,grouped_df)
S3method(bake,step_box_cox)
S3method(bake,step_diff)
S3method(bake,step_fourier)
Expand Down Expand Up @@ -251,6 +253,7 @@ export("%||%")
export(":=")
export(.data)
export(add_time)
export(anomalize)
export(as_label)
export(as_name)
export(auto_lambda)
Expand Down
26 changes: 14 additions & 12 deletions R/00_global_vars.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
globalVariables(c("index", "index.num", "y", "yhat", "week", "mday", "diff.median", ".",
"data", "nested.col", "weekday", "locale", "holiday_name", "values", "value",
"holidays", "lag", ".facets_collapsed", ".groups_consolidated", ".value_mod", ".value_smooth", "name",
".group", ".group_value", "feature", "id", "key", "splits",
"observed", "remainder", "seasadj", "season", "trend", ".color_mod",
".id", ".key", ".splits", ".value", "abs_diff_lower", "abs_diff_upper", "below_max_anoms",
"centerline", "direction", "limit_lower", "limit_upper", "max_abs_diff", "outlier", "outlier_reported",
"sorting", "anomaly", "recomposed_l1", "recomposed_l2", "remainder_l1", "remainder_l2",
".metric", ".metrics", ".rank_metric", ".rank_std_err", "failure_rate", "std_err",
".rank_failure_rate", ".txt", "training", "testing", "fitted", "model", "..val", "..nm",
".white_noise_lower", ".white_noise_upper", ".date_var_collapsed", ".rowid", "..date_agg", "grp_names",
"check_row_exists", ".box_group"))
globalVariables(
c("index", "index.num", "y", "yhat", "week", "mday", "diff.median", ".",
"data", "nested.col", "weekday", "locale", "holiday_name", "values", "value",
"holidays", "lag", ".facets_collapsed", ".groups_consolidated", ".value_mod", ".value_smooth", "name",
".group", ".group_value", "feature", "id", "key", "splits",
"observed", "remainder", "seasadj", "season", "trend", ".color_mod",
".id", ".key", ".splits", ".value", "abs_diff_lower", "abs_diff_upper", "below_max_anoms",
"centerline", "direction", "limit_lower", "limit_upper", "max_abs_diff", "outlier", "outlier_reported",
"sorting", "anomaly", "recomposed_l1", "recomposed_l2", "remainder_l1", "remainder_l2",
".metric", ".metrics", ".rank_metric", ".rank_std_err", "failure_rate", "std_err",
".rank_failure_rate", ".txt", "training", "testing", "fitted", "model", "..val", "..nm",
".white_noise_lower", ".white_noise_upper", ".date_var_collapsed", ".rowid", "..date_agg", "grp_names",
"check_row_exists", ".box_group", ".method")
)
277 changes: 277 additions & 0 deletions R/anomalize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
#' Automatic group-wise Anomaly Detection
#'
#' `anomalize()` is used to detect anomalies in time series data,
#' either for a single time series or for multiple time series grouped by a specific column.
#'
#' @param .data A `tibble` or `data.frame` with a time-based column
#' @param .date_var A column containing either date or date-time values
#' @param .value A column containing numeric values
#' @param .frequency Controls the seasonal adjustment (removal of seasonality).
#' Input can be either "auto", a time-based definition (e.g. "2 weeks"),
#' or a numeric number of observations per frequency (e.g. 10).
#' Refer to [tk_get_frequency()].
#' @param .trend Controls the trend component.
#' For STL, trend controls the sensitivity of the LOESS smoother, which is used to remove the remainder.
#' Refer to [tk_get_trend()].
#' @param .iqr_alpha Controls the width of the "normal" range. Lower values are more conservative
#' while higher values are less prone to incorrectly classifying "normal" observations.
#' @param .clean_alpha Controls the threshold for cleaning
#' the outliers. The default is 0.75, which means that the anomalies will be
#' cleaned using the 0.75 * lower or upper bound of the recomposed time series,
#' depending on the direction of the anomaly.
#' @param .max_anomalies The maximum percent of anomalies permitted to be identified.
#' @param .message A boolean. If `TRUE`, will output information related to automatic frequency
#' and trend selection (if applicable).
#'
#' @return
#' A `tibble` or `data.frame` with the following columns:
#' - observed: original data
#' - seasonal: seasonal component
#' - seasadaj: seasonal adjusted
#' - trend: trend component
#' - remainder: residual component
#' - anomaly: Yes/No flag for outlier detection
#' - anomaly score: distance from centerline
#' - anomaly direction: -1, 0, 1 inidicator for direction of the anomaly
#' - recomposed_l1: lower level bound of recomposed time series
#' - recomposed_l2: upper level bound of recomposed time series
#' - observed_clean: original data with anomalies interpolated
#'
#' @details
#'
#' The `anomalize()` method for anomaly detection that implements a 2-step process to
#' detect outliers in time series.
#'
#' __Step 1: Detrend & Remove Seasonality using STL Decomposition__
#'
#' The decomposition separates the "season" and "trend" components from the "observed" values
#' leaving the "remainder" for anomaly detection.
#'
#' The user can control two parameters: frequency and trend.
#'
#' 1. `.frequency`: Adjusts the "season" component that is removed from the "observed" values.
#' 2. `.trend`: Adjusts the trend window (t.window parameter from [stats::stl()] that is used.
#'
#' The user may supply both `.frequency` and `.trend` as time-based durations (e.g. "6 weeks") or
#' numeric values (e.g. 180) or "auto", which predetermines the frequency and/or trend based on
#' the scale of the time series using the [tk_time_scale_template()].
#'
#' __Step 2: Anomaly Detection__
#'
#' Once "trend" and "season" (seasonality) is removed, anomaly detection is performed on the "remainder".
#' Anomalies are identified, and boundaries (recomposed_l1 and recomposed_l2) are determined.
#'
#' The Anomaly Detection Method uses an inner quartile range (IQR) of +/-25 the median.
#'
#' _IQR Adjustment, alpha parameter_
#'
#' With the default `alpha = 0.05`, the limits are established by expanding
#' the 25/75 baseline by an IQR Factor of 3 (3X).
#' The _IQR Factor = 0.15 / alpha_ (hence 3X with alpha = 0.05):
#'
#' - To increase the IQR Factor controlling the limits, decrease the alpha,
#' which makes it more difficult to be an outlier.
#' - Increase alpha to make it easier to be an outlier.
#'
#'
#' - The IQR outlier detection method is used in `forecast::tsoutliers()`.
#' - A similar outlier detection method is used by Twitter's `AnomalyDetection` package.
#' - Both Twitter and Forecast tsoutliers methods have been implemented in Business Science's `anomalize`
#' package.
#'
#'
#' @references
#' 1. CLEVELAND, R. B., CLEVELAND, W. S., MCRAE, J. E., AND TERPENNING, I.
#' STL: A Seasonal-Trend Decomposition Procedure Based on Loess.
#' Journal of Official Statistics, Vol. 6, No. 1 (1990), pp. 3-73.
#'
#' 2. Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014).
#' A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.
#'
#' @examples
#' library(dplyr)
#'
#' walmart_sales_weekly %>%
#' filter(id %in% c("1_1", "1_3")) %>%
#' group_by(id) %>%
#' anomalize(Date, Weekly_Sales)
#'
#' @name anomalize
#' @export
#'
anomalize <- function(.data, .date_var, .value,
.frequency = "auto", .trend = "auto",
.method = "stl",
.iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
.message = TRUE) {

# Tidyeval Setup
date_var_expr <- rlang::enquo(.date_var)
value_expr <- rlang::enquo(.value)

# Checks
if (!is.data.frame(.data)) {
stop(call. = FALSE, "anomalize(.data) is not a data-frame or tibble. Please supply a data.frame or tibble.")
}
if (rlang::quo_is_missing(date_var_expr)) {
stop(call. = FALSE, "anomalize(.date_var) is missing. Please supply a date or date-time column.")
}
if (rlang::quo_is_missing(value_expr)) {
stop(call. = FALSE, "anomalize(.value) is missing. Please supply a numeric column.")
}

if (.method != "stl") {
stop(call. = FALSE, "anomalize(.method): Only 'stl' is currently implemented.")
}

UseMethod("anomalize", .data)

}


#' @export
anomalize.data.frame <- function(.data, .date_var, .value,
.frequency = "auto", .trend = "auto",
.method = "stl",
.iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
.message = TRUE) {

# STL Decomposition (observed, season, trend, remainder, seasadj)
ret <- .data %>%
tk_stl_diagnostics(
.date_var = !! rlang::enquo(.date_var),
.value = !! rlang::enquo(.value),
.frequency = .frequency,
.trend = .trend,
.message = .message
)

# Detect Anomalies (remainder_l1, remainder_l2, anomaly)
ret <- ret %>%
mutate_anomalies_2(
target = remainder,
alpha = .iqr_alpha,
max_anoms = .max_anomalies
)

# Recomposition
ret <- ret %>%
dplyr::mutate(
recomposed_l1 = season + trend + remainder_l1,
recomposed_l2 = season + trend + remainder_l2
) %>%
dplyr::select(-remainder_l1, -remainder_l2)


# Clean
ret <- ret %>%
dplyr::mutate(
observed_cleaned = dplyr::case_when(
anomaly_direction == -1 ~ .clean_alpha * recomposed_l1,
anomaly_direction == 1 ~ .clean_alpha * recomposed_l2,
TRUE ~ observed
)
)


return(ret)

}

#' @export
anomalize.grouped_df <- function(.data, .date_var, .value,
.frequency = "auto", .trend = "auto",
.method = "stl",
.iqr_alpha = 0.05, .clean_alpha = 0.75, .max_anomalies = 0.2,
.message = TRUE) {


# Tidy Eval Setup
value_expr <- rlang::enquo(.value)
date_var_expr <- rlang::enquo(.date_var)
group_names <- dplyr::group_vars(.data)

# Process groups individually
.data %>%
tidyr::nest() %>%
dplyr::mutate(nested.col = purrr::map(
.x = data,
.f = function(df) anomalize(
.data = df,
.date_var = !! date_var_expr,
.value = !! value_expr,
.frequency = .frequency,
.trend = .trend,
.method = .method,
.iqr_alpha = .iqr_alpha,
.clean_alpha = .clean_alpha,
.max_anomalies = .max_anomalies,
.message = .message
)
)) %>%
dplyr::select(-"data") %>%
tidyr::unnest(cols = nested.col) %>%
dplyr::group_by_at(.vars = group_names)

}

# UTILITIES ----

mutate_anomalies_2 <- function(data, target, alpha = 0.05, max_anoms = 0.20) {

# Checks
if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)

# Setup
target_expr <- rlang::enquo(target)
x <- data %>% dplyr::pull(!! target_expr)

# Explicitly call functions
outlier_tbl <- iqr_2(x = x, alpha = alpha, max_anoms = max_anoms, verbose = TRUE)

ret <- data %>% dplyr::bind_cols(outlier_tbl)

return(ret)

}


# This is anomalize::iqr()
iqr_2 <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) {

quantile_x <- stats::quantile(x, prob = c(0.25, 0.75), na.rm = TRUE)
iq_range <- quantile_x[[2]] - quantile_x[[1]]
limits <- quantile_x + (0.15 / alpha) * iq_range * c(-1, 1)

outlier_idx <- ((x < limits[1]) | (x > limits[2]))

centerline <- sum(limits) / 2
score <- abs(x - centerline)

outlier_reported <- dplyr::case_when(
x < limits[1] ~ "Yes",
x > limits[2] ~ "Yes",
TRUE ~ "No"
)

direction <- dplyr::case_when(
x < limits[1] ~ -1,
x > limits[2] ~ 1,
TRUE ~ 0
)

remainder_l1 <- limits[1]
remainder_l2 <- limits[2]

return(
tibble::tibble(
anomaly = outlier_reported,
anomaly_direction = direction,
anomaly_score = score,
remainder_l1 = remainder_l1,
remainder_l2 = remainder_l2
)
)

}

Loading

0 comments on commit 90060be

Please sign in to comment.