From 869cd4dda42290ed3bb2b14e7cd4c79de32500c8 Mon Sep 17 00:00:00 2001 From: kubeJocker <102039539+kubeJocker@users.noreply.github.com> Date: Mon, 26 Jun 2023 18:55:41 +0800 Subject: [PATCH] feat: add prometheus cluster into addons (#3869) Co-authored-by: runsun Co-authored-by: huangzhangshu --- .github/workflows/cicd-pull-request.yml | 2 +- .github/workflows/cicd-push.yml | 2 +- .github/workflows/release-helm-chart.yml | 6 +- .gitignore | 1 + .../templates/addons/prometheus-addon.yaml | 25 +- .../grafana/configmaps-datasources.yaml | 6 +- .../templates/prometheus/alertmanager.yaml | 4 +- deploy/helm/values.yaml | 1004 +---------------- deploy/prometheus-cluster/Chart.yaml | 9 +- .../prometheus-cluster/templates/cluster.yaml | 6 +- deploy/prometheus-cluster/values.yaml | 6 +- deploy/prometheus/Chart.yaml | 4 +- .../config/server/serverFiles/prometheus.yml | 23 - deploy/prometheus/templates/configmap.yaml | 36 +- internal/cli/cmd/dashboard/dashboard.go | 8 +- 15 files changed, 108 insertions(+), 1034 deletions(-) diff --git a/.github/workflows/cicd-pull-request.yml b/.github/workflows/cicd-pull-request.yml index 88d6b9cad7b..e620f692b5d 100644 --- a/.github/workflows/cicd-pull-request.yml +++ b/.github/workflows/cicd-pull-request.yml @@ -131,6 +131,6 @@ jobs: VERSION: "v0.4.0-check" CHART_NAME: "kubeblocks" CHART_DIR: "deploy/helm" - DEP_REPO: "delphic" + DEP_REPO: "delphic|prometheus-cluster" APECD_REF: "v0.1.4" secrets: inherit diff --git a/.github/workflows/cicd-push.yml b/.github/workflows/cicd-push.yml index fc52b8586aa..681ea2043c9 100644 --- a/.github/workflows/cicd-push.yml +++ b/.github/workflows/cicd-push.yml @@ -222,7 +222,7 @@ jobs: VERSION: "v0.4.0-check" CHART_NAME: "kubeblocks" CHART_DIR: "deploy/helm" - DEP_REPO: "delphic" + DEP_REPO: "delphic|prometheus-cluster" APECD_REF: "v0.1.4" secrets: inherit diff --git a/.github/workflows/release-helm-chart.yml b/.github/workflows/release-helm-chart.yml index 9bc45df014f..81f9c0bbab0 100644 --- a/.github/workflows/release-helm-chart.yml +++ b/.github/workflows/release-helm-chart.yml @@ -33,15 +33,15 @@ jobs: release-chart: needs: chart-version - uses: apecloud/apecloud-cd/.github/workflows/release-charts.yml@v0.1.0 + uses: apecloud/apecloud-cd/.github/workflows/release-charts.yml@v0.1.4 with: MAKE_OPS: "bump-chart-ver" VERSION: "${{ needs.chart-version.outputs.chart-version }}" CHART_NAME: "kubeblocks" CHART_DIR: "deploy/helm" DEP_CHART_DIR: "deploy/helm/depend-charts" - DEP_REPO: "helm dep update deploy/delphic" - APECD_REF: "v0.1.0" + DEP_REPO: "delphic|prometheus-cluster" + APECD_REF: "v0.1.4" secrets: inherit send-message: diff --git a/.gitignore b/.gitignore index c867a5cc7b9..4886e678c92 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,5 @@ tmp/ # helm dependency charts +deploy/**/Chart.lock deploy/**/charts diff --git a/deploy/helm/templates/addons/prometheus-addon.yaml b/deploy/helm/templates/addons/prometheus-addon.yaml index 59a1418e380..53685b25ad8 100644 --- a/deploy/helm/templates/addons/prometheus-addon.yaml +++ b/deploy/helm/templates/addons/prometheus-addon.yaml @@ -15,16 +15,17 @@ spec: helm: {{- if hasPrefix "oci://" .Values.addonChartLocationBase }} - chartLocationURL: {{ .Values.addonChartLocationBase }}/prometheus + chartLocationURL: {{ .Values.addonChartLocationBase }}/prometheus-cluster {{- else }} - chartLocationURL: {{ .Values.addonChartLocationBase }}/prometheus-15.16.1.tgz + chartLocationURL: {{ .Values.addonChartLocationBase }}/prometheus-cluster-{{ default .Chart.Version .Values.versionOverride }}.tgz {{- end }} installOptions: {{- if hasPrefix "oci://" .Values.addonChartLocationBase }} - version: 15.16.1 + version: {{ default .Chart.Version .Values.versionOverride }} {{- end }} + installValues: configMapRefs: - name: {{ include "addon.prometheus.name" . }}-chart-kubeblocks-values @@ -33,14 +34,14 @@ spec: valuesMapping: valueMap: replicaCount: server.replicaCount - storageClass: server.persistentVolume.storageClass - persistentVolumeEnabled: server.persistentVolume.enabled + storageClass: server.persistence.storageClass + persistentVolumeEnabled: server.persistence.enabled jsonMap: - tolerations: server.tolerations + tolerations: tolerations resources: - storage: server.persistentVolume.size + storage: server.persistence.size cpu: requests: server.resources.requests.cpu limits: server.resources.limits.cpu @@ -51,14 +52,14 @@ spec: - name: alertmanager valueMap: replicaCount: alertmanager.replicaCount - storageClass: alertmanager.persistentVolume.storageClass - persistentVolumeEnabled: alertmanager.persistentVolume.enabled + storageClass: alertmanager.persistence.storageClass + persistentVolumeEnabled: alertmanager.persistence.enabled jsonMap: - tolerations: alertmanager.tolerations + tolerations: tolerations resources: - storage: alertmanager.persistentVolume.size + storage: alertmanager.persistence.size cpu: requests: alertmanager.resources.requests.cpu limits: alertmanager.resources.limits.cpu @@ -143,4 +144,4 @@ spec: {{- end }} installable: - autoInstall: {{ .Values.prometheus.enabled }} \ No newline at end of file + autoInstall: {{ .Values.prometheus.enabled }} diff --git a/deploy/helm/templates/grafana/configmaps-datasources.yaml b/deploy/helm/templates/grafana/configmaps-datasources.yaml index 2e4054f5acf..013adea8429 100644 --- a/deploy/helm/templates/grafana/configmaps-datasources.yaml +++ b/deploy/helm/templates/grafana/configmaps-datasources.yaml @@ -16,7 +16,7 @@ data: datasource.yaml: |- apiVersion: 1 datasources: -{{- $scrapeInterval := default .Values.prometheus.server.global.scrape_interval | default "30s" }} +{{- $scrapeInterval := default .Values.prometheus.prometheus.server.global.scrape_interval | default "30s" }} {{- if .Values.grafana.sidecar.datasources.defaultDatasourceEnabled }} - name: Prometheus type: prometheus @@ -24,7 +24,7 @@ data: {{- if .Values.grafana.sidecar.datasources.url }} url: {{ .Values.grafana.sidecar.datasources.url }} {{- else }} - url: http://kb-addon-{{ include "addon.prometheus.name" . }}-server.{{ template "kubeblocks.namespace" . }}:80/ + url: http://kb-addon-{{ include "addon.prometheus.name" . }}-{{ .Values.prometheus.nameOverride }}-server.{{ template "kubeblocks.namespace" . }}:80/ {{- end }} access: proxy isDefault: true @@ -36,7 +36,7 @@ data: {{- if .Values.grafana.sidecar.datasources.url }} url: {{ .Values.grafana.sidecar.datasources.url }} {{- else }} - url: http://kb-addon-{{ include "addon.prometheus.name" . }}-server.{{ template "kubeblocks.namespace" . }}:80/ + url: http://kb-addon-{{ include "addon.prometheus.name" . }}-{{ .Values.prometheus.nameOverride }}-server.{{ template "kubeblocks.namespace" . }}:80/ {{- end }} access: proxy isDefault: false diff --git a/deploy/helm/templates/prometheus/alertmanager.yaml b/deploy/helm/templates/prometheus/alertmanager.yaml index 24a8af36e83..84fb7b5e58d 100644 --- a/deploy/helm/templates/prometheus/alertmanager.yaml +++ b/deploy/helm/templates/prometheus/alertmanager.yaml @@ -1,10 +1,10 @@ -{{- if not (empty .Values.prometheus.alertmanager.configMapOverrideName) }} +{{- if not (empty .Values.prometheus.prometheus.alertmanager.configMapOverrideName) }} apiVersion: v1 kind: ConfigMap metadata: labels: {{- include "kubeblocks.labels" . | nindent 4 }} - name: kb-addon-{{ include "addon.prometheus.name" . }}-{{ .Values.prometheus.alertmanager.configMapOverrideName }} + name: kb-addon-{{ include "addon.prometheus.name" . }}-{{ .Values.prometheus.prometheus.alertmanager.configMapOverrideName }} data: alertmanager.yml: | global: { } diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml index 6af7e5d1872..5e04006ed94 100644 --- a/deploy/helm/values.yaml +++ b/deploy/helm/values.yaml @@ -315,994 +315,48 @@ addonHelmInstallOptions: dashboards: enabled: true -# Sub-charts values (NOTES: following doesn't required @param documentations) +## Prometheus Addon +## prometheus: ## If false, prometheus sub-chart will not be installed ## enabled: false - alertmanager: - ## If false, alertmanager will not be installed - ## - enabled: true + nameOverride: "playground" - ## alertmanager container image - ## - image: - repository: registry.cn-hangzhou.aliyuncs.com/apecloud/alertmanager - tag: v0.24.0 - - ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}} - ## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml - ## to NOT generate a ConfigMap resource - ## - configMapOverrideName: "alertmanager-config" - - ## Node tolerations for alertmanager scheduling to nodes with taints - ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ - ## - tolerations: - - key: kb-controller - operator: Equal - value: "true" - effect: NoSchedule - - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - preference: - matchExpressions: - - key: kb-controller - operator: In - values: - - "true" - - persistentVolume: - ## If true, alertmanager will create/use a Persistent Volume Claim - ## If false, use emptyDir - ## - enabled: false + extraLabels: + addons.extensions.kubeblocks.io: "true" - ## alertmanager data Persistent Volume size - ## - size: 1Gi - - ## alertmanager data Persistent Volume Storage Class - ## If defined, storageClassName: - ## If set to "-", storageClassName: "", which disables dynamic provisioning - ## If undefined (the default) or set to null, no storageClassName spec is - ## set, choosing the default provisioner. (gp2 on AWS, standard on - ## GKE, AWS & OpenStack) - ## - # storageClass: "-" - - ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) - ## - replicaCount: 1 - - statefulSet: - ## If true, use a statefulset instead of a deployment for pod management. - ## This allows to scale replicas to more than 1 pod - ## - enabled: true - - ## Alertmanager headless service to use for the statefulset - ## - headless: - ## Enabling peer mesh service end points for enabling the HA alert manager - ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md - enableMeshPeer: true - - ## alertmanager resource requests and limits - ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - ## - resources: {} - # limits: - # cpu: 10m - # memory: 32Mi - # requests: - # cpu: 10m - # memory: 32Mi - - ## Security context to be added to alertmanager pods - ## - securityContext: - runAsUser: 0 - runAsNonRoot: false - runAsGroup: 65534 - fsGroup: 65534 - - containerSecurityContext: - allowPrivilegeEscalation: false - - ingress: - ## If true, alertmanager Ingress will be created - ## - enabled: false - - # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName - # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress - # ingressClassName: nginx - - ## alertmanager Ingress annotations - ## - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: 'true' - - ## alertmanager Ingress additional labels - ## - extraLabels: {} - - ## alertmanager Ingress hostnames with optional path - ## Must be provided if Ingress is enabled - ## - hosts: [] - # - alertmanager.domain.com - # - domain.com/alertmanager - - path: / - - # pathType is only for k8s >= 1.18 - pathType: Prefix - - ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. - extraPaths: [] - # - path: /* - # backend: - # serviceName: ssl-redirect - # servicePort: use-annotation - - ## alertmanager Ingress TLS configuration - ## Secrets must be manually created in the namespace - ## - tls: [] - # - secretName: prometheus-alerts-tls - # hosts: - # - alertmanager.domain.com - - service: - annotations: {} - labels: {} - clusterIP: "" - - ## Enabling peer mesh service end points for enabling the HA alert manager - ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md - # enableMeshPeer : true - - ## List of IP addresses at which the alertmanager service is available - ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips - ## - externalIPs: [] - - loadBalancerIP: "" - loadBalancerSourceRanges: [] - servicePort: 80 - # nodePort: 30000 - sessionAffinity: None - type: ClusterIP - - - kubeStateMetrics: - ## If false, kube-state-metrics sub-chart will not be installed - ## - enabled: false - - nodeExporter: - ## If false, node-exporter will not be installed - ## - enabled: false - - ## node-exporter container image - ## - image: - repository: registry.cn-hangzhou.aliyuncs.com/apecloud/node-exporter - tag: v1.3.1 - - configmapReload: - prometheus: - ## configmap-reload container image - ## - image: - repository: registry.cn-hangzhou.aliyuncs.com/apecloud/configmap-reload - tag: v0.5.0 + # prometheus sub chart + prometheus: alertmanager: - ## configmap-reload container image - ## - image: - repository: registry.cn-hangzhou.aliyuncs.com/apecloud/configmap-reload - tag: v0.5.0 - server: - ## Prometheus server container name - ## - enabled: true - - ## Prometheus server container image - ## - image: - repository: registry.cn-hangzhou.aliyuncs.com/apecloud/prometheus - tag: v2.44.0 - - global: - ## How frequently to scrape targets by default + ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}} + ## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml + ## to NOT generate a ConfigMap resource ## - scrape_interval: 15s - ## How long until a scrape request times out + configMapOverrideName: "alertmanager-config" + + server: + global: + ## How frequently to scrape targets by default + ## + scrape_interval: 15s + ## How long until a scrape request times out + ## + scrape_timeout: 10s + ## How frequently to evaluate rules + ## + evaluation_interval: 15s + + + ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write ## - scrape_timeout: 10s - ## How frequently to evaluate rules - ## - evaluation_interval: 15s - - ## Additional Prometheus server container flags - ## - extraFlags: - - web.enable-lifecycle - - web.enable-remote-write-receiver - - ## Additional Prometheus server container arguments - ## - extraArgs: - log.level: info - storage.tsdb.min-block-duration: 30m - enable-feature: memory-snapshot-on-shutdown - - ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write - ## - remoteWrite: [] - - ## Prefix used to register routes, overriding externalUrl route. - ## Useful for proxies that rewrite URLs. - ## - routePrefix: / - - ## Node tolerations for server scheduling to nodes with taints - ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ - ## - tolerations: - - key: kb-controller - operator: Equal - value: "true" - effect: NoSchedule - - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - preference: - matchExpressions: - - key: kb-controller - operator: In - values: - - "true" - - persistentVolume: - ## If true, Prometheus server will create/use a Persistent Volume Claim - ## If false, use emptyDir - ## - enabled: false + remoteWrite: [ ] - ## Prometheus server data Persistent Volume size + ## Prometheus' data retention period (default if not specified is 15 days) ## - size: 20Gi - - ## Prometheus server data Persistent Volume Storage Class - ## If defined, storageClassName: - ## If set to "-", storageClassName: "", which disables dynamic provisioning - ## If undefined (the default) or set to null, no storageClassName spec is - ## set, choosing the default provisioner. (gp2 on AWS, standard on - ## GKE, AWS & OpenStack) - ## - # storageClass: "-" - - ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) - ## - replicaCount: 1 - - statefulSet: - ## If true, use a statefulset instead of a deployment for pod management. - ## This allows to scale replicas to more than 1 pod - ## - enabled: true - - ## Prometheus server resource requests and limits - ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - ## - resources: {} - # limits: - # cpu: 500m - # memory: 512Mi - # requests: - # cpu: 500m - # memory: 512Mi - - ## Prometheus' data retention period (default if not specified is 15 days) - ## - retention: "2d" - - ## Security context to be added to server pods - ## - securityContext: - runAsUser: 0 - runAsNonRoot: false - runAsGroup: 65534 - fsGroup: 65534 - - containerSecurityContext: - allowPrivilegeEscalation: false - - service: - ## If false, no Service will be created for the Prometheus server - ## - enabled: true - - annotations: {} - labels: {} - clusterIP: "" - - ## List of IP addresses at which the Prometheus server service is available - ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips - ## - externalIPs: [] - - loadBalancerIP: "" - loadBalancerSourceRanges: [] - servicePort: 80 - sessionAffinity: None - type: ClusterIP - - ## Enable gRPC port on service to allow auto discovery with thanos-querier - gRPC: - enabled: false - servicePort: 10901 - # nodePort: 10901 - - ## If using a statefulSet (statefulSet.enabled=true), configure the - ## service to connect to a specific replica to have a consistent view - ## of the data. - statefulsetReplica: - enabled: false - replica: 0 - - ingress: - ## If true, Prometheus server Ingress will be created - ## - enabled: false - - # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName - # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress - # ingressClassName: nginx - - ## Prometheus server Ingress annotations - ## - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: 'true' - - ## Prometheus server Ingress additional labels - ## - extraLabels: {} - - ## Prometheus server Ingress hostnames with optional path - ## Must be provided if Ingress is enabled - ## - hosts: [] - # - prometheus.domain.com - # - domain.com/prometheus - - path: / - - # pathType is only for k8s >= 1.18 - pathType: Prefix - - ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. - extraPaths: [] - # - path: /* - # backend: - # serviceName: ssl-redirect - # servicePort: use-annotation - - ## Prometheus server Ingress TLS configuration - ## Secrets must be manually created in the namespace - ## - tls: [] - # - secretName: prometheus-server-tls - # hosts: - # - prometheus.domain.com - - - - - ## AlertManager ConfigMap Entries - ## NOTE: Please review these carefully as thresholds and behavior may not meet - ## your SLOs or labels. - ## - alertmanagerFiles: - alertmanager.yml: - global: { } - - receivers: - - name: default-receiver - - route: - receiver: default-receiver - group_wait: 5s - group_interval: 30s - repeat_interval: 10m - - ## Sample prometheus rules/alerts - ## NOTE: Please review these carefully as thresholds and behavior may not meet - ## your SLOs or labels. - ## - ruleFiles: - mysql_alert_rules.yml: | - groups: - - name: MysqldExporter - rules: - - alert: MysqlDown - expr: 'max_over_time(mysql_up[1m]) == 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'MySQL is down' - description: 'MySQL is down. (instance: {{ $labels.pod }})' - - - alert: MysqlRestarted - expr: 'mysql_global_status_uptime < 60' - for: 0m - labels: - severity: info - annotations: - summary: 'MySQL has just been restarted (< 60s)' - description: 'MySQL has just been restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})' - - - alert: MysqlTooManyConnections - expr: 'sum(max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 80' - for: 2m - labels: - severity: warning - annotations: - summary: 'MySQL has too many connections (> 80%)' - description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in use. (instance: {{ $labels.pod }})' - - - alert: MysqlConnectionErrors - expr: 'sum(increase(mysql_global_status_connection_errors_total[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0' - for: 2m - labels: - severity: warning - annotations: - summary: 'MySQL connection errors' - description: 'MySQL has connection errors and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' - - - alert: MysqlHighThreadsRunning - expr: 'sum(max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 60' - for: 2m - labels: - severity: warning - annotations: - summary: 'MySQL high threads running (> 60%)' - description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in running state. (instance: {{ $labels.pod }})' - - - alert: MysqlSlowQueries - expr: 'sum(increase(mysql_global_status_slow_queries[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0' - for: 2m - labels: - severity: info - annotations: - summary: 'MySQL slow queries' - description: 'MySQL server has {{ $value | printf "%.2f" }} slow query. (instance: {{ $labels.pod }})' - - - alert: MysqlInnodbLogWaits - expr: 'sum(rate(mysql_global_status_innodb_log_waits[5m])) BY (namespace,app_kubernetes_io_instance,pod) > 10' - for: 2m - labels: - severity: warning - annotations: - summary: 'MySQL InnoDB log waits (> 10)' - description: 'MySQL innodb log writes stalling and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' - - - alert: MysqlInnodbBufferPoolHits - expr: 'sum(rate(mysql_global_status_innodb_buffer_pool_reads[5m]) / rate(mysql_global_status_innodb_buffer_pool_read_requests[5m])) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 5' - for: 2m - labels: - severity: warning - annotations: - summary: 'MySQL InnoDB high read requests rate hitting disk (> 5%)' - description: 'High number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk. The value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - postgresql_alert_rules.yml: | - groups: - - name: PostgreSQLExporter - rules: - - alert: PostgreSQLDown - expr: 'max_over_time(pg_up[1m]) == 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'PostgreSQL is down' - description: 'PostgreSQL is down. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLRestarted - expr: 'time() - pg_postmaster_start_time_seconds < 60' - for: 0m - labels: - severity: info - annotations: - summary: 'PostgreSQL has just been restarted (< 60s)' - description: 'PostgreSQL has just been restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLExporterError - expr: 'pg_exporter_last_scrape_error > 0' - for: 0m - labels: - severity: warning - annotations: - summary: 'PostgreSQL exporter scrape error' - description: 'PostgreSQL exporter has {{ $value | printf "%.2f" }} scrape errors. A query may be buggy in query.yaml. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLTooManySlowQueries - expr: | - max by(namespace,app_kubernetes_io_instance,pod,datname) ( - max_over_time(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m]) - ) > 60 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL database has high number of slow queries' - description: 'PostgreSQL database has slow queries and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLTooManyConnections - expr: | - sum by (namespace,app_kubernetes_io_instance,pod) (pg_stat_activity_count{datname!~"template.*"}) - > on(namespace,app_kubernetes_io_instance,pod) - (pg_settings_max_connections - pg_settings_superuser_reserved_connections) * 0.8 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL too many connections (> 80%)' - description: 'PostgreSQL has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLDeadLocks - expr: 'increase(pg_stat_database_deadlocks_total{datname!~"template.*", datname!=""}[2m]) > 5' - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL database has dead locks (> 5)' - description: 'PostgreSQL database has {{ $value | printf "%.2f"}} dead locks. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLHighRollbackRate - expr: | - rate(pg_stat_database_xact_rollback_total{datname!~"template.*", datname!=""}[2m]) - / - rate(pg_stat_database_xact_commit_total{datname!~"template.*", datname!=""}[2m]) - > 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL database has high rollback rate (> 10%)' - description: 'Ratio of transactions being aborted compared to committed is {{ $value | printf "%.2f"}} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLTooManyLocksAcquired - expr: | - sum by (namespace,app_kubernetes_io_instance,pod) (pg_locks_count) - / on(namespace,app_kubernetes_io_instance,pod) - (pg_settings_max_locks_per_transaction * pg_settings_max_connections) - > 0.2 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL has too many locks acquired (> 20%)' - description: 'Too many locks acquired on the database and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLCacheHitRatio - expr: | - avg by (namespace,app_kubernetes_io_instance,pod,datname) ( - rate(pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m]) - / - ( - rate( - pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m] - ) - + - rate( - pg_stat_database_blks_read_total{datname!~"template.*", datname!=""}[2m] - ) - ) - ) < 0.9 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL database has low cache hit rate (< 90%)' - description: 'Low cache hit rate and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLMaxWriteBufferReached - expr: 'rate(pg_stat_bgwriter_maxwritten_clean_total[2m]) > 0' - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL write buffers reached max' - description: 'PostgreSQL background writer stops for max and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLHighWALFilesArchiveErrorRate - expr: | - rate(pg_stat_archiver_failed_count_total[2m]) - / ( - rate(pg_stat_archiver_archived_count_total[2m]) + rate(pg_stat_archiver_failed_count_total[2m]) - ) > 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL has high error rate in WAL files archiver(> 10%)' - description: 'PostgreSQL high error rate in WAL files archiver and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: PostgreSQLTableNotAutoVacuumed - expr: | - (pg_stat_user_tables_last_autovacuum > 0) - and - (time() - pg_stat_user_tables_last_autovacuum) - > 24 * 60 * 60 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: 'PostgreSQL table in database has not been auto vacuumed for 10 days' - description: 'Table {{ $labels.relname }} in database has not been auto vacuumed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLTableNotAutoAnalyzed - expr: | - (pg_stat_user_tables_last_autoanalyze > 0) - and - (time() - pg_stat_user_tables_last_autoanalyze) - > 24 * 60 * 60 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: 'PostgreSQL table in database has not been auto analyzed for 10 days' - description: 'Table {{ $labels.relname }} in database has not been auto analyzed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - - alert: PostgreSQLTableTooManyDeadTuples - expr: | - (pg_stat_user_tables_n_dead_tup > 10000) - / - (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) - >= 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: 'PostgreSQL table in database has too many dead tuples (> 10%)' - description: 'Table {{ $labels.relname }} in database dead tuples is too large and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' - - redis_alert_rules.yml: | - groups: - - name: RedisExporter - rules: - - alert: RedisDown - expr: 'redis_up == 0' - for: 5m - labels: - severity: critical - annotations: - summary: 'Redis is down' - description: 'Redis is down. (instance: {{ $labels.pod }})' - - - alert: RedisCPUHigh - expr: '(rate(redis_cpu_sys_seconds_total[1m]) + rate(redis_cpu_user_seconds_total[1m])) * 100 > 80' - for: 2m - labels: - severity: warning - annotations: - summary: 'Out of CPU (> 80%)' - description: 'Redis is running out of CPU and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: RedisMemoryHigh - expr: '(redis_memory_max_bytes == 0 or redis_memory_used_bytes * 100 / redis_memory_max_bytes) > 90' - for: 5m - labels: - severity: warning - annotations: - summary: 'Out of memory (> 90%)' - description: 'Redis is running out of memory and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: RedisTooManyConnections - expr: 'redis_connected_clients * 100 / redis_config_maxclients > 80' - for: 1m - labels: - severity: warning - annotations: - summary: 'Redis has too many connections (> 80%)' - description: 'Redis has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' - - - alert: RedisRejectedConnections - expr: 'increase(redis_rejected_connections_total[1m]) > 0' - for: 5m - labels: - severity: error - annotations: - summary: 'Redis has rejected connections' - description: '{{ $value | printf "%.2f" }} connections to Redis has been rejected. (instance: {{ $labels.pod }})' - - - alert: RedisKeyEviction - expr: 'increase(redis_evicted_keys_total[5m]) > 0' - for: 1s - labels: - severity: error - annotations: - summary: 'Redis has evicted keys' - description: 'Redis has evicted keys in the last 5 minutes and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' - - - alert: RedisMissingMaster - expr: 'count by (app_kubernetes_io_instance) (redis_instance_info{role="master"}) < 1' - for: 30s - labels: - severity: critical - annotations: - summary: 'Redis missing master' - description: 'Redis cluster has no node marked as master.' - - - alert: RedisDisconnectedSlaves - expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1' - for: 0m - labels: - severity: critical - annotations: - summary: 'Redis disconnected slaves' - description: 'Redis not replicating for all slaves. Consider reviewing the redis replication status. (instance: {{ $labels.pod }})' - - - alert: RedisReplicationBroken - expr: 'delta(redis_connected_slaves[1m]) < 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'Redis replication broken' - description: 'Redis instance lost a slave. (instance: {{ $labels.pod }})' - - mongodb_alert_rules.yaml: |- - groups: - - name: MongodbExporter - rules: - - alert: MongodbDown - expr: 'max_over_time(mongodb_up[1m]) == 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'MongoDB is Down' - description: 'MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbRestarted - expr: 'mongodb_instance_uptime_seconds < 60' - for: 0m - labels: - severity: info - annotations: - summary: 'Mongodb has just been restarted (< 60s)' - description: 'Mongodb has just been restarted {{ $value | printf "%.1f" }} seconds ago\n LABELS = {{ $labels }}' - - - alert: MongodbReplicaMemberUnhealthy - expr: 'max_over_time(mongodb_rs_members_health[1m]) == 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'Mongodb replica member is unhealthy' - description: 'MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbReplicationLag - expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' - for: 0m - labels: - severity: critical - annotations: - summary: 'MongoDB replication lag (> 10s)' - description: 'Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbReplicationHeadroom - expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' - for: 0m - labels: - severity: critical - annotations: - summary: 'MongoDB replication headroom (< 0)' - description: 'MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbNumberCursorsOpen - expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' - for: 2m - labels: - severity: warning - annotations: - summary: 'MongoDB opened cursors num (> 10k)' - description: 'Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbCursorsTimeouts - expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100' - for: 2m - labels: - severity: warning - annotations: - summary: 'MongoDB cursors timeouts (>100/minute)' - description: 'Too many cursors are timing out (> 100/minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbTooManyConnections - expr: 'avg by(pod) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(pod) (sum (mongodb_ss_connections) by(pod)) * 100 > 80' - for: 2m - labels: - severity: warning - annotations: - summary: 'MongoDB too many connections (> 80%)' - description: 'Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - - alert: MongodbVirtualMemoryUsage - expr: '(sum(mongodb_ss_mem_virtual) BY (pod) / sum(mongodb_ss_mem_resident) BY (pod)) > 100' - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB virtual memory usage high - description: "High memory usage: the quotient of (mem_virtual / mem_resident) is more than 100\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - kafka_alert_rules.yaml: |- - group: - - name: KafkaExporter - rules: - - alert: KafkaTopicsReplicas - expr: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3' - for: 0m - labels: - severity: critical - annotations: - summary: 'Kafka topics replicas (instance {{ $labels.app_kubernetes_io_instance }})' - description: 'Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - alert: KafkaConsumersGroup - expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50' - for: 1m - labels: - severity: critical - annotations: - summary: 'Kafka consumers group (instance {{ $labels.app_kubernetes_io_instance }})' - description: 'Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' - - alert: KafkaBrokerDown - expr: 'kafka_brokers < 3' - for: 0m - labels: - severity: critical - annotations: - Summary: 'Kafka broker *{{ $labels.app_kubernetes_io_instance }}* alert status' - description: 'One of the Kafka broker *{{ $labels.app_kubernetes_io_instance }}* is down.' - - serverFiles: - prometheus.yml: - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/mysql_alert_rules.yml - - /etc/config/postgresql_alert_rules.yml - - /etc/config/redis_alert_rules.yml - - /etc/config/kafka_alert_rules.yml - - /etc/config/mongodb_alert_rules.yaml - - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - # Scrape config for kubeblocks managed service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `monitor.kubeblocks.io/scrape`: Only scrape services that have a value of - # `true`. - # * `monitor.kubeblocks.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `monitor.kubeblocks.io/path`: If the metrics path is not `/metrics` override this. - # * `monitor.kubeblocks.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - # * `monitor.kubeblocks.io/param_`: If the metrics endpoint uses parameters - # then you can set any parameter - - job_name: 'kubeblocks-service' - honor_labels: true - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [ __meta_kubernetes_service_label_app_kubernetes_io_managed_by ] - action: keep - regex: kubeblocks - - source_labels: [ __meta_kubernetes_service_label_monitor_kubeblocks_io_managed_by ] - action: drop - regex: agamotto - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_scrape ] - action: keep - regex: true - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_scheme ] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_path ] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [ __address__, __meta_kubernetes_service_annotation_monitor_kubeblocks_io_port ] - action: replace - target_label: __address__ - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_annotation_monitor_kubeblocks_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [ __meta_kubernetes_namespace ] - action: replace - target_label: namespace - - source_labels: [ __meta_kubernetes_service_name ] - action: replace - target_label: service - - source_labels: [ __meta_kubernetes_pod_node_name ] - action: replace - target_label: node - - source_labels: [ __meta_kubernetes_pod_name ] - action: replace - target_label: pod - - source_labels: [ __meta_kubernetes_pod_phase ] - regex: Pending|Succeeded|Failed|Completed - action: drop - - - job_name: 'kubeblocks-agamotto' - honor_labels: true - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [ __meta_kubernetes_service_label_monitor_kubeblocks_io_managed_by ] - action: keep - regex: agamotto - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_scrape ] - action: keep - regex: true - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_scheme ] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [ __meta_kubernetes_service_annotation_monitor_kubeblocks_io_path ] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [ __address__, __meta_kubernetes_service_annotation_monitor_kubeblocks_io_port ] - action: replace - target_label: __address__ - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_annotation_monitor_kubeblocks_io_param_(.+) - replacement: __param_$1 - - source_labels: [ __meta_kubernetes_pod_phase ] - regex: Pending|Succeeded|Failed|Completed - action: drop - - pushgateway: - ## If false, pushgateway will not be installed - ## - enabled: false + retention: "2d" ## loki settings for kubeblocks loki: diff --git a/deploy/prometheus-cluster/Chart.yaml b/deploy/prometheus-cluster/Chart.yaml index 8d0ba8a20ba..c6b01ee0c83 100644 --- a/deploy/prometheus-cluster/Chart.yaml +++ b/deploy/prometheus-cluster/Chart.yaml @@ -15,10 +15,15 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 14.7.2 +version: 0.5.1-beta.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: 14.7.2 \ No newline at end of file +appVersion: 2.44.0 + +dependencies: + - name: prometheus + version: '>0.0.1-alpha.0' + repository: file://../prometheus \ No newline at end of file diff --git a/deploy/prometheus-cluster/templates/cluster.yaml b/deploy/prometheus-cluster/templates/cluster.yaml index d0e01160d97..1e6573979ab 100644 --- a/deploy/prometheus-cluster/templates/cluster.yaml +++ b/deploy/prometheus-cluster/templates/cluster.yaml @@ -2,7 +2,11 @@ apiVersion: apps.kubeblocks.io/v1alpha1 kind: Cluster metadata: name: {{ include "prometheus-cluster.fullname" . }} - labels: {{ include "prometheus-cluster.labels" . | nindent 4 }} + labels: + {{- include "prometheus-cluster.labels" . | nindent 4 }} + {{- with .Values.extraLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} spec: clusterDefinitionRef: prometheus # ref clusterdefinition.name clusterVersionRef: prometheus-{{ default .Chart.AppVersion .Values.clusterVersionOverride }} # ref clusterversion.name diff --git a/deploy/prometheus-cluster/values.yaml b/deploy/prometheus-cluster/values.yaml index 1c4eb1164c0..c977b0405fc 100644 --- a/deploy/prometheus-cluster/values.yaml +++ b/deploy/prometheus-cluster/values.yaml @@ -9,6 +9,8 @@ terminationPolicy: Delete ## tolerations: [] +extraLabels: {} + ## @param topologySpreadConstraints Topology Spread Constraints for pod assignment spread across your cluster among failure-domains. Evaluated as a template ## Ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/#spread-constraints-for-pods ## @@ -28,7 +30,7 @@ server: # memory: 512Mi persistence: - enabled: true + enabled: false ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -54,7 +56,7 @@ alertmanager: # memory: 512Mi persistence: - enabled: true + enabled: false ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning diff --git a/deploy/prometheus/Chart.yaml b/deploy/prometheus/Chart.yaml index 973b282d723..62d8cebbf05 100644 --- a/deploy/prometheus/Chart.yaml +++ b/deploy/prometheus/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 14.7.2 +version: 0.5.1-beta.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: 14.7.2 +appVersion: 2.44.0 diff --git a/deploy/prometheus/config/server/serverFiles/prometheus.yml b/deploy/prometheus/config/server/serverFiles/prometheus.yml index b75e8e8451f..299074c1601 100644 --- a/deploy/prometheus/config/server/serverFiles/prometheus.yml +++ b/deploy/prometheus/config/server/serverFiles/prometheus.yml @@ -110,26 +110,3 @@ scrape_configs: regex: Pending|Succeeded|Failed|Completed action: drop -alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: pod - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - relabel_configs: - - source_labels: [ __meta_kubernetes_namespace ] - regex: default - action: keep - - source_labels: [ __meta_kubernetes_pod_label_app ] - regex: prometheus - action: keep - - source_labels: [ __meta_kubernetes_pod_label_component ] - regex: alertmanager - action: keep - - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_probe ] - regex: .* - action: keep - - source_labels: [ __meta_kubernetes_pod_container_port_number ] - regex: "9093" - action: keep diff --git a/deploy/prometheus/templates/configmap.yaml b/deploy/prometheus/templates/configmap.yaml index 92a006e540f..204a362eab3 100644 --- a/deploy/prometheus/templates/configmap.yaml +++ b/deploy/prometheus/templates/configmap.yaml @@ -40,8 +40,10 @@ metadata: name: {{ template "prometheus.server.fullname" . }}-config data: {{- $values := .Values }} +{{- $root := . -}} {{- range $path, $content := .Files.Glob "config/server/serverFiles/*" }} - {{ trimPrefix "config/server/serverFiles/" $path }}: |- + {{- if eq $path "config/server/serverFiles/prometheus.yml" -}} + {{ trimPrefix "config/server/serverFiles/" $path | nindent 2 }}: |- global: {{- with $values.server.global }} scrape_interval: {{ .scrape_interval }} @@ -49,8 +51,36 @@ data: evaluation_interval: {{ .evaluation_interval }} {{- end }} {{- if $values.server.remoteWrite }} - remote_write: - {{- $values.server.remoteWrite | toYaml | nindent 8 }} + remote_write: + {{- $values.server.remoteWrite | toYaml | nindent 8 }} {{- end }} + alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [ __meta_kubernetes_namespace ] + regex: {{ $root.Release.Namespace }} + action: keep + - source_labels: [ __meta_kubernetes_pod_label_app_kubernetes_io_instance ] + regex: .*prometheus-playground + action: keep + - source_labels: [ __meta_kubernetes_pod_label_apps_kubeblocks_io_component_name ] + regex: alertmanager + action: keep + - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_probe ] + regex: .* + action: keep + - source_labels: [ __meta_kubernetes_pod_container_port_number ] + regex: "9093" + action: keep {{- $content | toString | nindent 4 }} + {{- else }} + {{ trimPrefix "config/server/serverFiles/" $path | nindent 2 }}: |- + {{- $content | toString | nindent 4 }} + {{- end }} {{- end }} + diff --git a/internal/cli/cmd/dashboard/dashboard.go b/internal/cli/cmd/dashboard/dashboard.go index 34fae2f07da..5052c7b41a8 100644 --- a/internal/cli/cmd/dashboard/dashboard.go +++ b/internal/cli/cmd/dashboard/dashboard.go @@ -94,14 +94,14 @@ var ( }, { Name: "kubeblocks-prometheus-alertmanager", - AddonName: "kb-addon-prometheus-alertmanager", - Label: "app=prometheus,component=alertmanager,release=kb-addon-prometheus", + AddonName: "kb-addon-prometheus-playground-alertmanager", + Label: "app.kubernetes.io/instance=kb-addon-prometheus-playground,app.kubernetes.io/managed-by=kubeblocks,apps.kubeblocks.io/component-name=alertmanager", TargetPort: "19093", }, { Name: "kubeblocks-prometheus-server", - AddonName: "kb-addon-prometheus-server", - Label: "app=prometheus,component=server,release=kb-addon-prometheus", + AddonName: "kb-addon-prometheus-playground-server", + Label: "app.kubernetes.io/instance=kb-addon-prometheus-playground,app.kubernetes.io/managed-by=kubeblocks,apps.kubeblocks.io/component-name=server", TargetPort: "19090", }, {