From e33bead00d02d521dd04241b8aa2e94a9158b862 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Tue, 9 Jan 2024 11:02:42 +0400 Subject: [PATCH 1/3] fix(DMVP-3291): Fixed alarms' metrics --- alarms.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/alarms.tf b/alarms.tf index e7bfef4..fd6c833 100644 --- a/alarms.tf +++ b/alarms.tf @@ -24,7 +24,7 @@ module "cw_alerts" { // Replicas { name = "${var.name} has 0 available replicas in ${var.cluster_name}" - source = "ContainerInsights/kube_deployment_spec_replicas" + source = "ContainerInsights/service_number_of_running_pods" filters = { ClusterName = var.cluster_name, Deployment = var.name, @@ -38,7 +38,7 @@ module "cw_alerts" { // CPU { name = "${var.name} has cpu problem in ${var.cluster_name}", - source = "ContainerInsights/pod_cpu_utilization", + source = "ContainerInsights/pod_cpu_usage_total", filters = { PodName = var.name, ClusterName = var.cluster_name, @@ -52,7 +52,7 @@ module "cw_alerts" { // MEMORY { name = "${var.name} has memory problem in ${var.cluster_name}", - source = "ContainerInsights/pod_memory_utilization", + source = "ContainerInsights/pod_memory_working_set", filters = { PodName = var.name, ClusterName = var.cluster_name, From 5c3a459fb8a6456077db506415b396e537f0b622 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Tue, 9 Jan 2024 16:13:41 +0400 Subject: [PATCH 2/3] fix(DMVP-3291): Added network in, out and hpa max alarms. 
--- README.md | 43 ++++++- alarms.tf | 134 +++++++++++++--------- examples/basic-yaml/api.yaml | 2 +- examples/customized_alarms/main.tf | 14 ++- examples/deployment_disabled/main.tf | 1 + examples/some_alarms_disabled/README.md | 29 +++++ examples/some_alarms_disabled/main.tf | 19 +++ examples/some_alarms_disabled/provider.tf | 40 +++++++ variables.tf | 35 +++++- 9 files changed, 252 insertions(+), 65 deletions(-) create mode 100644 examples/some_alarms_disabled/README.md create mode 100644 examples/some_alarms_disabled/main.tf create mode 100644 examples/some_alarms_disabled/provider.tf diff --git a/README.md b/README.md index 04d9b70..0c43d7b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,42 @@ # service +## What + +This module +- deploys services by using `helm_release` tf resource: + - it is conditional: set `deploy_service` to false if you don't want to deploy a service, +- creates these basic alarms in CloudWatch for the service: + - service pod's received traffic is out of anomaly band, + - service pod's transmitted traffic is out of anomaly band, + - service pod has 2 or more restarts in 5 minutes, + - service has 0 available replicas, + - service HPA has been on its maximum for 5 minutes: there are maximum pods of the service. + +## How +Alarms are configured by default but can be customized via `alarms.custom_values` parameter. +By default all 5 alarms are enabled but each of them can be disabled: +``` +module "this" { + .... + + alarms = { + sns_topic = "default" + restarts = { + enabled = false + } + network_out = { + enabled = false + } + } + + .... +} +``` +In this case restarts, network_out alarms will not be created. Only maximum_replicas_usage, replicas, network_in alarms will be created. + +## Use Cases +Please check `examples` folder for more detailed examples. + ## Requirements @@ -27,11 +64,11 @@ No requirements. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [alarms](#input\_alarms) | Alarms enabled by default you need set sns topic name for send alarms for customize alarms threshold use custom\_values |
object({
enabled = optional(bool, true)
sns_topic = string
custom_values = optional(any, {})
})
| n/a | yes | +| [alarms](#input\_alarms) | Alarms are enabled by default. You need to set SNS topic name to send alarms. Use custom\_values to customize alarms. |
object({
enabled = optional(bool, true)
sns_topic = string
custom_values = optional(any, {})
restarts = optional(object({
enabled = bool
}), {
enabled = true
})
replicas = optional(object({
enabled = bool
}), {
enabled = true
})
network_in = optional(object({
enabled = bool
}), {
enabled = true
})
network_out = optional(object({
enabled = bool
}), {
enabled = true
})
maximum_replicas_usage = optional(object({
enabled = bool
maximum_replicas = number
}), {
enabled = true
maximum_replicas = 3 //The count of HPA maximum for a service. It will be used as a threshold for HPA maximum alarm.
})

})
| n/a | yes | | [cluster\_name](#input\_cluster\_name) | Cluster name | `string` | n/a | yes | | [deploy\_service](#input\_deploy\_service) | Wether to deploy the service via helm or not. | `bool` | `true` | no | -| [helm\_values](#input\_helm\_values) | Values which is overwrite chart defaults | `any` | `null` | no | -| [name](#input\_name) | Service names | `string` | n/a | yes | +| [helm\_values](#input\_helm\_values) | Values which overwrite chart defaults | `any` | `null` | no | +| [name](#input\_name) | Service name. It's used as a helm release name and specified PodName in AWS CloudWatch metrics for which alarms will be created. | `string` | n/a | yes | | [namespace](#input\_namespace) | Namespace | `string` | `null` | no | ## Outputs diff --git a/alarms.tf b/alarms.tf index fd6c833..a6b3a31 100644 --- a/alarms.tf +++ b/alarms.tf @@ -6,64 +6,90 @@ module "cw_alerts" { sns_topic = var.alarms.sns_topic - alerts = [ + alerts = concat( // Restarts - { - name = "${var.name} has too many restarts in ${var.cluster_name}" - source = "ContainerInsights/pod_number_of_container_restarts" - filters = { - ClusterName = var.cluster_name, - Deployment = var.name, - Namespace = var.namespace - } - period = try(var.alarms.custom_values.restarts.period, 300), - statistic = try(var.alarms.custom_values.restarts.statistic, "max"), - threshold = try(var.alarms.custom_values.restarts.threshold, 2) - equation = try(var.alarms.custom_values.restarts.equation, "gte") - }, + var.alarms.restarts.enabled ? 
[ + { + name = "${var.name} has too many restarts in ${var.cluster_name}" + source = "ContainerInsights/pod_number_of_container_restarts" + filters = { + ClusterName = var.cluster_name, + Namespace = var.namespace + PodName = var.name, + } + period = try(var.alarms.custom_values.restarts.period, 300), + statistic = try(var.alarms.custom_values.restarts.statistic, "max"), + threshold = try(var.alarms.custom_values.restarts.threshold, 2) + equation = try(var.alarms.custom_values.restarts.equation, "gte") + }, + ] : [], // Replicas - { - name = "${var.name} has 0 available replicas in ${var.cluster_name}" - source = "ContainerInsights/service_number_of_running_pods" - filters = { - ClusterName = var.cluster_name, - Deployment = var.name, - Namespace = var.namespace - } - period = try(var.alarms.custom_values.replicas.period, 300), - statistic = try(var.alarms.custom_values.replicas.statistic, "avg"), - threshold = try(var.alarms.custom_values.replicas.threshold, 0), - equation = try(var.alarms.custom_values.replicas.equation, "lte") - }, - // CPU - { - name = "${var.name} has cpu problem in ${var.cluster_name}", - source = "ContainerInsights/pod_cpu_usage_total", - filters = { - PodName = var.name, - ClusterName = var.cluster_name, - Namespace = var.namespace + var.alarms.replicas.enabled ? 
[ + { + name = "${var.name} has 0 available replicas in ${var.cluster_name}" + source = "ContainerInsights/service_number_of_running_pods" + filters = { + ClusterName = var.cluster_name, + Namespace = var.namespace + Service = var.name, + } + period = try(var.alarms.custom_values.replicas.period, 300), + statistic = try(var.alarms.custom_values.replicas.statistic, "avg"), + threshold = try(var.alarms.custom_values.replicas.threshold, 0), + equation = try(var.alarms.custom_values.replicas.equation, "lte") }, - period = try(var.alarms.custom_values.cpu.period, 300), - statistic = try(var.alarms.custom_values.cpu.statistic, "avg"), - threshold = try(var.alarms.custom_values.cpu.threshold, 90) - equation = try(var.alarms.custom_values.cpu.equation, "gte") - }, - // MEMORY - { - name = "${var.name} has memory problem in ${var.cluster_name}", - source = "ContainerInsights/pod_memory_working_set", - filters = { - PodName = var.name, - ClusterName = var.cluster_name, - Namespace = var.namespace + ] : [], + // Network In + var.alarms.network_in.enabled ? [ + { + name = "${var.name} is outside of Network < In band in ${var.cluster_name}", + source = "ContainerInsights/pod_network_rx_bytes", + filters = { + ClusterName = var.cluster_name, + Namespace = var.namespace + PodName = var.name, + }, + period = try(var.alarms.custom_values.network_in.period, 300), + statistic = try(var.alarms.custom_values.network_in.statistic, "avg"), + threshold = try(var.alarms.custom_values.network_in.threshold, 350000) + equation = try(var.alarms.custom_values.network_in.equation, "ltlgtu") + anomaly_detection = true }, - period = try(var.alarms.custom_values.memory.period, 300), - statistic = try(var.alarms.custom_values.memory.statistic, "avg"), - threshold = try(var.alarms.custom_values.memory.threshold, 90) - equation = try(var.alarms.custom_values.memory.equation, "gte") - }, - ] + ] : [], + // Network Out + var.alarms.network_out.enabled ? 
[ + { + name = "${var.name} is outside of Network > Out band ${var.cluster_name}", + source = "ContainerInsights/pod_network_tx_bytes", + filters = { + ClusterName = var.cluster_name, + Namespace = var.namespace + PodName = var.name, + }, + period = try(var.alarms.custom_values.network_out.period, 300), + statistic = try(var.alarms.custom_values.network_out.statistic, "avg"), + threshold = try(var.alarms.custom_values.network_out.threshold, 90) + equation = try(var.alarms.custom_values.network_out.equation, "ltlgtu") + anomaly_detection = true + }, + ] : [], + // HPA Maximum + var.alarms.maximum_replicas_usage.enabled ? [ + { + name = "${var.name} has been on HPA maximum for 5 minutes in ${var.cluster_name}", + source = "ContainerInsights/kube_deployment_status_replicas_available", + filters = { + ClusterName = var.cluster_name, + Namespace = var.namespace + Deployment = var.name, + }, + period = try(var.alarms.custom_values.maximum_replicas_usage.period, 300), + statistic = try(var.alarms.custom_values.maximum_replicas_usage.statistic, "avg"), + threshold = try(var.alarms.custom_values.maximum_replicas_usage.threshold, var.alarms.maximum_replicas_usage.maximum_replicas) + equation = try(var.alarms.custom_values.maximum_replicas_usage.equation, "gte") + } + ] : [], + ) depends_on = [ helm_release.service diff --git a/examples/basic-yaml/api.yaml b/examples/basic-yaml/api.yaml index 69f64f9..546523e 100644 --- a/examples/basic-yaml/api.yaml +++ b/examples/basic-yaml/api.yaml @@ -5,7 +5,7 @@ variables: namespace: test cluster_name: "eks-dev" alarms: - - sns_topic: "Default" + sns_topic: "Default" helm_values: image: repository: xxxxx.dkr.ecr.us-east-1.amazonaws.com/api diff --git a/examples/customized_alarms/main.tf b/examples/customized_alarms/main.tf index 8ebb05a..0064a7f 100644 --- a/examples/customized_alarms/main.tf +++ b/examples/customized_alarms/main.tf @@ -23,16 +23,16 @@ module "this" { alarms = { sns_topic = "Default" custom_values = { - cpu = { + 
network_in = { period = 300, statistic = "avg", - threshold = 80 + threshold = 80000 equation = "gte" }, - memory = { + network_out = { period = 300, statistic = "avg", - threshold = 80 + threshold = 80000 equation = "gte" }, restarts = { @@ -47,6 +47,12 @@ module "this" { threshold = 0 equation = "lte" }, + maximum_replicas_usage = { + period = 300, + statistic = "avg", + threshold = 6 + equation = "gte" + }, } } } diff --git a/examples/deployment_disabled/main.tf b/examples/deployment_disabled/main.tf index 6c6020e..1250018 100644 --- a/examples/deployment_disabled/main.tf +++ b/examples/deployment_disabled/main.tf @@ -4,6 +4,7 @@ module "this" { deploy_service = false name = "api01" + namespace = "test" cluster_name = "eks-dev" alarms = { diff --git a/examples/some_alarms_disabled/README.md b/examples/some_alarms_disabled/README.md new file mode 100644 index 0000000..786caa6 --- /dev/null +++ b/examples/some_alarms_disabled/README.md @@ -0,0 +1,29 @@ +# some_alarms_disabled + + +## Requirements + +No requirements. + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [this](#module\_this) | ../../ | n/a | + +## Resources + +No resources. + +## Inputs + +No inputs. + +## Outputs + +No outputs. 
+ diff --git a/examples/some_alarms_disabled/main.tf b/examples/some_alarms_disabled/main.tf new file mode 100644 index 0000000..561d6ef --- /dev/null +++ b/examples/some_alarms_disabled/main.tf @@ -0,0 +1,19 @@ +module "this" { + source = "../../" + + deploy_service = false + + name = "api01" + namespace = "test" + cluster_name = "eks-dev" + + alarms = { + sns_topic = "default" + restarts = { + enabled = false + } + network_out = { + enabled = false + } + } +} diff --git a/examples/some_alarms_disabled/provider.tf b/examples/some_alarms_disabled/provider.tf new file mode 100644 index 0000000..0f519b7 --- /dev/null +++ b/examples/some_alarms_disabled/provider.tf @@ -0,0 +1,40 @@ +## This file and its content are generated based on config, pleas check README.md for more details +provider "aws" { + region = "us-east-1" +} + +provider "kubernetes" { + cluster_ca_certificate = "cluster_ca_certificate" + host = "host" + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = [ + "eks", + "--region", + "eu-central-1", + "get-token", + "--cluster-name", + "dev"] + command = "aws" + } +} + +provider "helm" { + kubernetes { + cluster_ca_certificate = "cluster_ca_certificate" + host = "host" + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = [ + "eks", + "--region", + "us-east-1", + "get-token", + "--cluster-name", + "eks-dev" + ] + command = "aws" + } + } +} diff --git a/variables.tf b/variables.tf index 22f075b..6a4b5d7 100644 --- a/variables.tf +++ b/variables.tf @@ -1,6 +1,6 @@ variable "name" { type = string - description = "Service names" + description = "Service name. It's used as a helm release name and specified PodName in AWS CloudWatch metrics for which alarms will be created." 
} variable "namespace" { @@ -16,7 +16,7 @@ variable "cluster_name" { variable "helm_values" { type = any - description = "Values which is overwrite chart defaults" + description = "Values which overwrite chart defaults" default = null } @@ -25,10 +25,39 @@ variable "alarms" { enabled = optional(bool, true) sns_topic = string custom_values = optional(any, {}) + restarts = optional(object({ + enabled = bool + }), { + enabled = true + }) + replicas = optional(object({ + enabled = bool + }), { + enabled = true + }) + network_in = optional(object({ + enabled = bool + }), { + enabled = true + }) + network_out = optional(object({ + enabled = bool + }), { + enabled = true + }) + maximum_replicas_usage = optional(object({ + enabled = bool + maximum_replicas = number + }), { + enabled = true + maximum_replicas = 3 //The count of HPA maximum for a service. It will be used as a threshold for HPA maximum alarm. + }) + }) - description = "Alarms enabled by default you need set sns topic name for send alarms for customize alarms threshold use custom_values" + description = "Alarms are enabled by default. You need to set SNS topic name to send alarms. Use custom_values to customize alarms." } + variable "deploy_service" { type = bool description = "Wether to deploy the service via helm or not." From d0dd30a76a3b9f22514d7b474a0c7b3cadfcf323 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Tue, 9 Jan 2024 16:18:03 +0400 Subject: [PATCH 3/3] fix(DMVP-3291): Deleted thresholds for network in and out alerts --- README.md | 2 +- alarms.tf | 2 -- variables.tf | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c43d7b..523a375 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ No requirements. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [alarms](#input\_alarms) | Alarms are enabled by default. You need to set SNS topic name to send alarms. 
Use custom\_values to customize alarms. |
object({
enabled = optional(bool, true)
sns_topic = string
custom_values = optional(any, {})
restarts = optional(object({
enabled = bool
}), {
enabled = true
})
replicas = optional(object({
enabled = bool
}), {
enabled = true
})
network_in = optional(object({
enabled = bool
}), {
enabled = true
})
network_out = optional(object({
enabled = bool
}), {
enabled = true
})
maximum_replicas_usage = optional(object({
enabled = bool
maximum_replicas = number
}), {
enabled = true
maximum_replicas = 3 //The count of HPA maximum for a service. It will be used as a threshold for HPA maximum alarm.
})

})
| n/a | yes | +| [alarms](#input\_alarms) | Alarms are enabled by default. You need to set SNS topic name to send alarms. Use custom\_values to customize alarms. |
object({
enabled = optional(bool, true)
sns_topic = string
custom_values = optional(any, {})
restarts = optional(object({
enabled = bool
}), {
enabled = true
})
replicas = optional(object({
enabled = bool
}), {
enabled = true
})
network_in = optional(object({
enabled = bool
}), {
enabled = true
})
network_out = optional(object({
enabled = bool
}), {
enabled = true
})
maximum_replicas_usage = optional(object({
enabled = optional(bool, true)
maximum_replicas = number
}), {
enabled = true
maximum_replicas = 3 //The count of HPA maximum for a service. It will be used as a threshold for HPA maximum alarm.
})

})
| n/a | yes | | [cluster\_name](#input\_cluster\_name) | Cluster name | `string` | n/a | yes | | [deploy\_service](#input\_deploy\_service) | Wether to deploy the service via helm or not. | `bool` | `true` | no | | [helm\_values](#input\_helm\_values) | Values which overwrite chart defaults | `any` | `null` | no | diff --git a/alarms.tf b/alarms.tf index a6b3a31..55cb378 100644 --- a/alarms.tf +++ b/alarms.tf @@ -51,7 +51,6 @@ module "cw_alerts" { }, period = try(var.alarms.custom_values.network_in.period, 300), statistic = try(var.alarms.custom_values.network_in.statistic, "avg"), - threshold = try(var.alarms.custom_values.network_in.threshold, 350000) equation = try(var.alarms.custom_values.network_in.equation, "ltlgtu") anomaly_detection = true }, @@ -68,7 +67,6 @@ module "cw_alerts" { }, period = try(var.alarms.custom_values.network_out.period, 300), statistic = try(var.alarms.custom_values.network_out.statistic, "avg"), - threshold = try(var.alarms.custom_values.network_out.threshold, 90) equation = try(var.alarms.custom_values.network_out.equation, "ltlgtu") anomaly_detection = true }, diff --git a/variables.tf b/variables.tf index 6a4b5d7..dbe7962 100644 --- a/variables.tf +++ b/variables.tf @@ -46,7 +46,7 @@ variable "alarms" { enabled = true }) maximum_replicas_usage = optional(object({ - enabled = bool + enabled = optional(bool, true) maximum_replicas = number }), { enabled = true