diff --git a/common/vault.yaml.tmpl b/common/vault.yaml.tmpl index 43d1b68..aff23ca 100644 --- a/common/vault.yaml.tmpl +++ b/common/vault.yaml.tmpl @@ -2,43 +2,6 @@ # DO NOT REMOVE line above, used in `pre-commit` hook groups: - - name: VaultClients - # Recommendations from https://s3-us-west-2.amazonaws.com/hashicorp-education/whitepapers/Vault/Vault-Consul-Monitoring-Guide.pdf - rules: - - alert: VaultSidecarCredentialsExpired - expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 - for: 10m - labels: - group: vault_clients - annotations: - description: | - The credentials served by the vault credentials agent sidecar have expired and have not - been renewed. This may cause issues for the other containers in the pod. - summary: "The credentials for '{{ $labels.kubernetes_pod_name }}' have expired" - dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars - - alert: VaultSidecarDown - expr: up{job="vault-credentials-agents"} == 0 - for: 10m - labels: - group: vault_clients - annotations: - description: | - The vault credentials agent sidecar is down. This may cause issues for the other containers - in the pod. - summary: "The vault credentials agent for '{{ $labels.kubernetes_pod_name }}' is down" - dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars - - alert: VaultSidecarMissing - expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"} - for: 10m - labels: - group: vault_clients - annotations: - description: | - The pod is annotated with `{{ $labels.key }}={{ $labels.value }}` but does not have a - container matching the name `vault-credentials-agent.*`. This indicates an issue with - the sidecar injection. Check the `kube-system/k8s-sidecar-injector` deployment for problems. 
- summary: "Vault sidecar is missing from {{ $labels.namespace }}/{{ $labels.pod }}" - dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars - name: VaultSetup rules: - alert: VaultHighGCDuration diff --git a/common/container.yaml.tmpl b/stock/container.yaml.tmpl similarity index 98% rename from common/container.yaml.tmpl rename to stock/container.yaml.tmpl index f2dc699..17b6a9f 100644 --- a/common/container.yaml.tmpl +++ b/stock/container.yaml.tmpl @@ -29,6 +29,8 @@ groups: - alert: ContainerOOMing expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) kube_pod_container_status_ready == 0 for: 5m + labels: + group: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has been OOMKilled recently and it's not ready" impact: "Container not ready, may affect service uptime" diff --git a/common/missing_replicas.yaml.tmpl b/stock/missing_replicas.yaml.tmpl similarity index 100% rename from common/missing_replicas.yaml.tmpl rename to stock/missing_replicas.yaml.tmpl diff --git a/common/namespace_sync.yaml.tmpl b/stock/namespace_sync.yaml.tmpl similarity index 100% rename from common/namespace_sync.yaml.tmpl rename to stock/namespace_sync.yaml.tmpl diff --git a/common/storage.yaml.tmpl b/stock/storage.yaml.tmpl similarity index 100% rename from common/storage.yaml.tmpl rename to stock/storage.yaml.tmpl diff --git a/common/terraform_sync.yaml.tmpl b/stock/terraform_sync.yaml.tmpl similarity index 100% rename from common/terraform_sync.yaml.tmpl rename to stock/terraform_sync.yaml.tmpl diff --git a/stock/vault-clients.yaml.tmpl b/stock/vault-clients.yaml.tmpl new file mode 100644 index 0000000..4f840d2 --- /dev/null +++ b/stock/vault-clients.yaml.tmpl @@ -0,0 +1,41 @@ +# PROMETHEUS RULES +# DO NOT REMOVE line above, used in `pre-commit` hook + +groups: + - name: VaultClients + # Recommendations from 
https://s3-us-west-2.amazonaws.com/hashicorp-education/whitepapers/Vault/Vault-Consul-Monitoring-Guide.pdf + rules: + - alert: VaultSidecarCredentialsExpired + expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 + for: 10m + labels: + group: vault_clients + annotations: + description: | + The credentials served by the vault credentials agent sidecar have expired and have not + been renewed. This may cause issues for the other containers in the pod. + summary: "The credentials for '{{ $labels.kubernetes_pod_name }}' have expired" + dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars + - alert: VaultSidecarDown + expr: up{job="vault-credentials-agents"} == 0 + for: 10m + labels: + group: vault_clients + annotations: + description: | + The vault credentials agent sidecar is down. This may cause issues for the other containers + in the pod. + summary: "The vault credentials agent for '{{ $labels.kubernetes_pod_name }}' is down" + dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars + - alert: VaultSidecarMissing + expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"} + for: 10m + labels: + group: vault_clients + annotations: + description: | + The pod is annotated with `injector.tumblr.com/request={{ $labels.annotation_injector_tumblr_com_request }}` but does not have a + container matching the name `vault-credentials-agent.*`. This indicates an issue with + the sidecar injection. Check the `kube-system/k8s-sidecar-injector` deployment for problems. + summary: "Vault sidecar is missing from {{ $labels.namespace }}/{{ $labels.pod }}" + dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars