From 26cdc3a8bfd1d85b0f8d3f313d044360374e8be9 Mon Sep 17 00:00:00 2001
From: Hector Huertas
Date: Tue, 10 Oct 2023 15:24:48 +0200
Subject: [PATCH] Reduce ContainerRestarting alert noise

Once fired, leave it firing for 10m. That should help with crashloops
where the alert keeps getting resolved and firing again.
---
 common/all.yaml.tmpl       | 1 +
 common/container.yaml.tmpl | 1 +
 2 files changed, 2 insertions(+)

diff --git a/common/all.yaml.tmpl b/common/all.yaml.tmpl
index 478de60..d479405 100644
--- a/common/all.yaml.tmpl
+++ b/common/all.yaml.tmpl
@@ -143,6 +143,7 @@ groups:
           dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}"
       - alert: SystemPodRestartingOften
         expr: increase(kube_pod_container_status_restarts_total{namespace=~"kube-system|sys-.*"}[10m]) > 3
+        keep_firing_for: 10m
         labels:
           team: infra
         annotations:
diff --git a/common/container.yaml.tmpl b/common/container.yaml.tmpl
index 680bf4f..d7e759b 100644
--- a/common/container.yaml.tmpl
+++ b/common/container.yaml.tmpl
@@ -6,6 +6,7 @@ groups:
     rules:
       - alert: ContainerRestartingOften
         expr: increase(kube_pod_container_status_restarts_total[10m]) > 3
+        keep_firing_for: 10m
         labels:
           group: container
         annotations: