From e8830b01061a40da94a445986895e5fb027d0017 Mon Sep 17 00:00:00 2001 From: ido Date: Tue, 24 Sep 2024 21:47:31 -0400 Subject: [PATCH 1/2] serviceMonitor --- charts/hermes/Chart.yaml | 2 +- charts/hermes/templates/deployment.yaml | 6 ++--- charts/hermes/templates/service.yaml | 2 ++ charts/hermes/templates/servicemonitor.yaml | 28 +++++++++++++++++++++ charts/hermes/values.yaml | 7 ++++++ 5 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 charts/hermes/templates/servicemonitor.yaml diff --git a/charts/hermes/Chart.yaml b/charts/hermes/Chart.yaml index 74fedad51c..016c9bd4ad 100644 --- a/charts/hermes/Chart.yaml +++ b/charts/hermes/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.4.1 +version: 0.4.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/hermes/templates/deployment.yaml b/charts/hermes/templates/deployment.yaml index 7a633c0515..24ac3f0170 100644 --- a/charts/hermes/templates/deployment.yaml +++ b/charts/hermes/templates/deployment.yaml @@ -3,18 +3,18 @@ kind: Deployment metadata: name: {{ include "hermes.fullname" . }} labels: - app: astria-dev-cluster + app: {{ include "hermes.fullname" . }} namespace: {{ include "hermes.namespace" . }} spec: replicas: {{ .Values.global.replicaCount }} selector: matchLabels: - app: astria-dev-cluster + app: {{ include "hermes.fullname" . }} template: metadata: name: {{ include "hermes.fullname" . }} labels: - app: astria-dev-cluster + app: {{ include "hermes.fullname" . }} spec: {{- if .Values.createChannel.enabled }} initContainers: diff --git a/charts/hermes/templates/service.yaml b/charts/hermes/templates/service.yaml index cac98ebeba..0eda70c8a7 100644 --- a/charts/hermes/templates/service.yaml +++ b/charts/hermes/templates/service.yaml @@ -2,6 +2,8 @@ kind: Service apiVersion: v1 metadata: + labels: + app: {{ include "hermes.fullname" . }} name: {{ include "hermes.fullname" . }}-service namespace: {{ include "hermes.namespace" . }} spec: diff --git a/charts/hermes/templates/servicemonitor.yaml b/charts/hermes/templates/servicemonitor.yaml new file mode 100644 index 0000000000..6cf7dfc356 --- /dev/null +++ b/charts/hermes/templates/servicemonitor.yaml @@ -0,0 +1,28 @@ +{{- if and .Values.serviceMonitor.enabled .Values.telemetry.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hermes-relayer-metrics + labels: + app: {{ include "hermes.fullname" . }} + {{- with .Values.serviceMonitor.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + jobLabel: hermes-relayer-metric + namespaceSelector: + matchNames: + - {{ include "hermes.namespace" . }} + selector: + matchLabels: + app: {{ include "hermes.fullname" . }} + endpoints: + - port: telemetry + path: /metrics + {{- with .Values.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} +{{- end }} diff --git a/charts/hermes/values.yaml b/charts/hermes/values.yaml index fc2ca893b4..27a745fd00 100644 --- a/charts/hermes/values.yaml +++ b/charts/hermes/values.yaml @@ -34,6 +34,13 @@ telemetry: tracingServer: enabled: false +# ServiceMonitor configuration +serviceMonitor: + enabled: false + port: 26660 + additionalLabels: + release: kube-prometheus-stack + mode: clients: enabled: true From 4a0151659654d60c3d2848b587d1eba34688a020 Mon Sep 17 00:00:00 2001 From: ido Date: Wed, 25 Sep 2024 17:14:18 -0400 Subject: [PATCH 2/2] prometheus-rules, default values --- charts/hermes/files/config.toml | 4 ++-- charts/hermes/templates/_helpers.tpl | 14 ++++++++++++ charts/hermes/templates/prometheusrule.yaml | 20 +++++++++++++++++ charts/hermes/templates/service.yaml | 2 +- charts/hermes/values.yaml | 25 ++++++++++++++++++++- 5 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 charts/hermes/templates/prometheusrule.yaml diff --git a/charts/hermes/files/config.toml b/charts/hermes/files/config.toml index cdc9a90ce2..ff2b396478 100644 --- a/charts/hermes/files/config.toml +++ b/charts/hermes/files/config.toml @@ -8,7 +8,7 @@ enabled = {{ .Values.rest.enabled }} # Specify the IPv4/6 host over which the built-in HTTP server will serve the RESTful # API requests. Default: 127.0.0.1 -host = '127.0.0.1' +host = '0.0.0.0' # Specify the port over which the built-in HTTP server will serve the restful API # requests. Default: 3000 @@ -35,7 +35,7 @@ tx_confirmation = {{ .Values.mode.packets.txConfirmation }} [telemetry] enabled = {{ .Values.telemetry.enabled }} -host = '127.0.0.1' +host = '0.0.0.0' port = {{ .Values.ports.telemetry }} [telemetry.buckets] diff --git a/charts/hermes/templates/_helpers.tpl b/charts/hermes/templates/_helpers.tpl index 3b63b396d0..d00d784ce5 100644 --- a/charts/hermes/templates/_helpers.tpl +++ b/charts/hermes/templates/_helpers.tpl @@ -22,6 +22,20 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this {{- end -}} {{- end -}} +{{/* +Common labels +*/}} +{{- define "hermes.labels" -}} +{{ include "hermes.selectorLabels" . }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "hermes.selectorLabels" -}} +app: {{ include "hermes.fullname" . }} +{{- end }} + {{/* Return if ingress is stable. */}} diff --git a/charts/hermes/templates/prometheusrule.yaml b/charts/hermes/templates/prometheusrule.yaml new file mode 100644 index 0000000000..f07374c0f1 --- /dev/null +++ b/charts/hermes/templates/prometheusrule.yaml @@ -0,0 +1,20 @@ +{{- if .Values.alerting.enabled -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ template "hermes.fullname" . }} +{{- if .Values.alerting.prometheusRule.namespace }} + namespace: {{ .Values.alerting.prometheusRule.namespace | quote }} +{{- end }} + labels: + {{- include "hermes.labels" . | nindent 4 }} + {{- if .Values.alerting.prometheusRule.additionalLabels }} + {{- toYaml .Values.alerting.prometheusRule.additionalLabels | nindent 4 }} + {{- end }} +spec: +{{- if .Values.alerting.prometheusRule.rules }} + groups: + - name: {{ template "hermes.fullname" . }} + rules: {{- toYaml .Values.alerting.prometheusRule.rules | nindent 4 }} +{{- end }} +{{- end }} diff --git a/charts/hermes/templates/service.yaml b/charts/hermes/templates/service.yaml index 0eda70c8a7..d95aecf713 100644 --- a/charts/hermes/templates/service.yaml +++ b/charts/hermes/templates/service.yaml @@ -16,7 +16,7 @@ spec: targetPort: rest {{- end }} {{- if .Values.telemetry.enabled }} - - name: telemetry-svc + - name: telemetry port: {{ .Values.ports.telemetry }} targetPort: telemetry {{- end }} diff --git a/charts/hermes/values.yaml b/charts/hermes/values.yaml index 27a745fd00..cc878df14a 100644 --- a/charts/hermes/values.yaml +++ b/charts/hermes/values.yaml @@ -3,7 +3,7 @@ global: replicaCount: 1 logLevel: debug -image: ghcr.io/penumbra-zone/hermes:main +image: ghcr.io/astriaorg/hermes:sha-450f848 imagePullPolicy: IfNotPresent fullnameOverride: "" @@ -195,3 +195,26 @@ ingress: # - secretName: chart-example-tls # hosts: # - chart-example.local + +alerting: + enabled: false + interval: "" + additionalLabels: + release: kube-prometheus-stack + annotations: {} + # scrapeTimeout: 10s + # path: /metrics + prometheusRule: + enabled: true + additionalLabels: + release: kube-prometheus-stack + namespace: monitoring + rules: + - alert: Chain_Node_Down + expr: up{container="cometbft"} == 0 # Insert your query Expression + for: 1m # Rough number but should be enough to init warn + labels: + severity: critical + annotations: + summary: Chain Node is Down (instance {{ $labels.instance }}) + description: "chain node '{{ $labels.namespace }}' has disappeared from Prometheus target discovery.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"