From 4a175da0410cbe5f2eed507aeb3d79d0c24ca286 Mon Sep 17 00:00:00 2001
From: Venkata Challa
Date: Fri, 26 Jul 2024 08:35:53 +0100
Subject: [PATCH] Increase alert frequency for long window size

The default 1 minute interval between checks doesn't work for window
sizes that are more than 4h. Increase to 5 minutes for 6h and 12h.

Shorter windows, including PT1M, keep the 1 minute interval so that every
value accepted by the alert_window_size validation has a frequency.
---
Notes: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. If context lines do not match,
apply with `git apply --ignore-whitespace`.

 aks/postgres/resources.tf | 15 +++++++++++++++
 aks/postgres/tfdocs.md    |  2 +-
 aks/postgres/variables.tf | 14 ++++++++++----
 aks/redis/resources.tf    | 13 +++++++++++++
 aks/redis/tfdocs.md       |  2 +-
 aks/redis/variables.tf    | 12 ++++++++----
 6 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/aks/postgres/resources.tf b/aks/postgres/resources.tf
index 7129e02..f2f17ef 100644
--- a/aks/postgres/resources.tf
+++ b/aks/postgres/resources.tf
@@ -10,6 +10,18 @@ locals {
   azure_enable_monitoring = var.use_azure && var.azure_enable_monitoring
 
   kubernetes_name = "${var.service_name}-${var.environment}-postgres${local.name_suffix}"
+
+  # Interval between checks per window size; every value accepted by the alert_window_size validation needs a key here.
+  alert_frequency_map = {
+    PT1M  = "PT1M"
+    PT5M  = "PT1M"
+    PT15M = "PT1M"
+    PT30M = "PT1M"
+    PT1H  = "PT1M"
+    PT6H  = "PT5M"
+    PT12H = "PT5M"
+  }
+  alert_frequency = local.alert_frequency_map[var.alert_window_size]
 }
 
 # Username & password
@@ -172,6 +184,7 @@ resource "azurerm_monitor_metric_alert" "memory" {
   scopes      = [azurerm_postgresql_flexible_server.main[0].id]
   description = "Action will be triggered when memory use is greater than 75%"
   window_size = var.alert_window_size
+  frequency   = local.alert_frequency
 
   criteria {
     metric_namespace = "Microsoft.DBforPostgreSQL/flexibleServers"
@@ -200,6 +213,7 @@ resource "azurerm_monitor_metric_alert" "cpu" {
   scopes      = [azurerm_postgresql_flexible_server.main[0].id]
   description = "Action will be triggered when cpu use is greater than ${var.azure_cpu_threshold}%"
   window_size = var.alert_window_size
+  frequency   = local.alert_frequency
 
   criteria {
     metric_namespace = "Microsoft.DBforPostgreSQL/flexibleServers"
@@ -228,6 +242,7 @@ resource "azurerm_monitor_metric_alert" "storage" {
   scopes      = [azurerm_postgresql_flexible_server.main[0].id]
   description = "Action will be triggered when storage use is greater than ${var.azure_storage_threshold}%"
   window_size = var.alert_window_size
+  frequency   = local.alert_frequency
 
   criteria {
     metric_namespace = "Microsoft.DBforPostgreSQL/flexibleServers"
diff --git a/aks/postgres/tfdocs.md b/aks/postgres/tfdocs.md
index 8010eda..05182c4 100644
--- a/aks/postgres/tfdocs.md
+++ b/aks/postgres/tfdocs.md
@@ -48,7 +48,7 @@ No modules.
 |------|-------------|------|---------|:--------:|
 | [admin\_password](#input\_admin\_password) | Password of the admin user | `string` | `null` | no |
 | [admin\_username](#input\_admin\_username) | Username of the admin user | `string` | `null` | no |
-| [alert\_window\_size](#input\_alert\_window\_size) | The period of time that is used to monitor alert activity e.g PT1M, PT5M, PT15M, PT30M, PT1H, PT6H or PT12H | `string` | `"PT5M"` | no |
+| [alert\_window\_size](#input\_alert\_window\_size) | The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly. | `string` | `"PT5M"` | no |
 | [azure\_cpu\_threshold](#input\_azure\_cpu\_threshold) | n/a | `number` | `80` | no |
 | [azure\_enable\_backup\_storage](#input\_azure\_enable\_backup\_storage) | n/a | `bool` | `true` | no |
 | [azure\_enable\_high\_availability](#input\_azure\_enable\_high\_availability) | n/a | `bool` | `false` | no |
diff --git a/aks/postgres/variables.tf b/aks/postgres/variables.tf
index 98beb73..08e2f09 100644
--- a/aks/postgres/variables.tf
+++ b/aks/postgres/variables.tf
@@ -124,11 +124,17 @@ variable "azure_enable_monitoring" {
   default = true
 }
 
+
+
 variable "alert_window_size" {
-  type        = string
-  nullable    = false
-  default     = "PT5M"
-  description = "The period of time that is used to monitor alert activity e.g PT1M, PT5M, PT15M, PT30M, PT1H, PT6H or PT12H"
+  type     = string
+  nullable = false
+  default  = "PT5M"
+  validation {
+    condition     = contains(["PT1M", "PT5M", "PT15M", "PT30M", "PT1H", "PT6H", "PT12H"], var.alert_window_size)
+    error_message = "The alert_window_size must be one of: PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H"
+  }
+  description = "The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly."
 }
 
 variable "azure_maintenance_window" {
diff --git a/aks/redis/resources.tf b/aks/redis/resources.tf
index 5afa82d..72a08e8 100644
--- a/aks/redis/resources.tf
+++ b/aks/redis/resources.tf
@@ -6,6 +6,18 @@ locals {
   azure_enable_monitoring = var.use_azure && var.azure_enable_monitoring
 
   kubernetes_name = "${var.service_name}-${var.environment}-redis${local.name_suffix}"
+
+  # Interval between checks per window size; every value accepted by the alert_window_size validation needs a key here.
+  alert_frequency_map = {
+    PT1M  = "PT1M"
+    PT5M  = "PT1M"
+    PT15M = "PT1M"
+    PT30M = "PT1M"
+    PT1H  = "PT1M"
+    PT6H  = "PT5M"
+    PT12H = "PT5M"
+  }
+  alert_frequency = local.alert_frequency_map[var.alert_window_size]
 }
 
 # Azure
@@ -86,6 +98,7 @@ resource "azurerm_monitor_metric_alert" "memory" {
   scopes      = [azurerm_redis_cache.main[0].id]
   description = "Action will be triggered when memory use is greater than ${var.azure_memory_threshold}%"
   window_size = var.alert_window_size
+  frequency   = local.alert_frequency
 
   criteria {
     metric_namespace = "Microsoft.Cache/redis"
diff --git a/aks/redis/tfdocs.md b/aks/redis/tfdocs.md
index 1e3958d..187ff85 100644
--- a/aks/redis/tfdocs.md
+++ b/aks/redis/tfdocs.md
@@ -32,7 +32,7 @@ No modules.
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
-| [alert\_window\_size](#input\_alert\_window\_size) | The period of time that is used to monitor alert activity e.g PT1M, PT5M, PT15M, PT30M, PT1H, PT6H or PT12H | `string` | `"PT5M"` | no |
+| [alert\_window\_size](#input\_alert\_window\_size) | The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly. | `string` | `"PT5M"` | no |
 | [azure\_capacity](#input\_azure\_capacity) | n/a | `number` | `1` | no |
 | [azure\_enable\_monitoring](#input\_azure\_enable\_monitoring) | n/a | `bool` | `true` | no |
 | [azure\_family](#input\_azure\_family) | n/a | `string` | `"C"` | no |
diff --git a/aks/redis/variables.tf b/aks/redis/variables.tf
index ed77404..63178c5 100644
--- a/aks/redis/variables.tf
+++ b/aks/redis/variables.tf
@@ -111,8 +111,12 @@ variable "azure_patch_schedule" {
 }
 
 variable "alert_window_size" {
-  type        = string
-  default     = "PT5M"
-  nullable    = false
-  description = "The period of time that is used to monitor alert activity e.g PT1M, PT5M, PT15M, PT30M, PT1H, PT6H or PT12H"
+  type     = string
+  default  = "PT5M"
+  nullable = false
+  validation {
+    condition     = contains(["PT1M", "PT5M", "PT15M", "PT30M", "PT1H", "PT6H", "PT12H"], var.alert_window_size)
+    error_message = "The alert_window_size must be one of: PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H"
+  }
+  description = "The period of time that is used to monitor alert activity e.g. PT1M, PT5M, PT15M, PT30M, PT1H, PT6H, PT12H. The interval between checks is adjusted accordingly."
 }