From 9288b25cac9ef24252b314adcbe72efdb4ba7ee2 Mon Sep 17 00:00:00 2001
From: iminfinity
Date: Thu, 14 Nov 2024 16:53:26 +0530
Subject: [PATCH] add smartmon_sata rule to handle sata cable failures

---
Review notes (free text after the separator; not part of the commit message
and not part of any file the patch creates):

The new rule file defines one alert in the group
`monitor::system::service::smartmon`. It fires with severity `critical` once
`smartmon_udma_crc_error_count_raw_value` has been non-zero for 3h on a host
whose `certname` also has an `obmondo_monitoring` series.

NOTE(review): the `and on(certname) obmondo_monitoring` gate only checks that
a series with the same `certname` exists; neither its value nor its
`alert_id` label is tested. Confirm that this is the intended opt-in check.

NOTE(review): presumably the raw value is a lifetime count that never resets,
in which case the alert keeps firing after the cable has been replaced.
Confirm, and consider alerting on growth of the value instead.

 .../rules/smartmon_sata.yaml                  | 18 ++++++++++
 .../prometheusRule-smartmon-sata.yaml         |  9 +++++
 .../tests/smartmon_sata.yaml                  | 35 +++++++++++++++++++
 .../prometheus-linuxaid/values.yaml           |  1 +
 4 files changed, 63 insertions(+)
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/rules/smartmon_sata.yaml
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-smartmon-sata.yaml
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/tests/smartmon_sata.yaml

diff --git a/argocd-helm-charts/prometheus-linuxaid/rules/smartmon_sata.yaml b/argocd-helm-charts/prometheus-linuxaid/rules/smartmon_sata.yaml
new file mode 100644
index 00000000..bd83f863
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/rules/smartmon_sata.yaml
@@ -0,0 +1,18 @@
+groups:
+  - name: monitor::system::service::smartmon
+    rules:
+      - alert: monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value
+        expr: |
+          smartmon_udma_crc_error_count_raw_value != 0 and on(certname) obmondo_monitoring
+        for: 3h
+        labels:
+          severity: critical
+          alert_id: monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value
+        annotations:
+          summary: 'Disk **{{ $labels.disk }}** on **{{ $labels.certname }}** has disk sata failure'
+          description: |
+            Disk **{{ $labels.disk }}** on **{{ $labels.certname }}** has disk sata failure,
+            instance="**{{ $labels.instance }}**",
+            type="**{{ $labels.type }}**",
+
+            UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **{{ $value }}** is concerning and indicates potential issues with the data cable or connections.
diff --git a/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-smartmon-sata.yaml b/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-smartmon-sata.yaml
new file mode 100644
index 00000000..4a1e6acf
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-smartmon-sata.yaml
@@ -0,0 +1,9 @@
+{{- if .Values.prometheusRule.smartmon_sata }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: smartmon-sata-rules
+  namespace: monitoring-{{ .Values.customerid }}
+spec:
+  {{- $.Files.Get "rules/smartmon_sata.yaml" | nindent 4 }}
+{{- end }}
diff --git a/argocd-helm-charts/prometheus-linuxaid/tests/smartmon_sata.yaml b/argocd-helm-charts/prometheus-linuxaid/tests/smartmon_sata.yaml
new file mode 100644
index 00000000..78279177
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/tests/smartmon_sata.yaml
@@ -0,0 +1,35 @@
+---
+evaluation_interval: 1m
+
+rule_files:
+  - ../rules/smartmon_sata.yaml
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: 'obmondo_monitoring{certname="jacen.enableit", alert_id="monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value"}'
+        values: '1x200'
+      - series: 'smartmon_udma_crc_error_count_raw_value{certname="jacen.enableit", disk="/dev/sdb", type="sat", instance="htzhel1-ax41a.enableit:63385"}'
+        values: '1x200'
+      - series: 'smartmon_udma_crc_error_count_raw_value{certname="jacen.enableit", disk="/dev/sda", type="sat", instance="htzhel1-ax41a.enableit:63385"}'
+        values: '0x200'
+
+    alert_rule_test:
+      - alertname: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
+        eval_time: 3h
+        exp_alerts:
+          - exp_labels:
+              severity: 'critical'
+              certname: 'jacen.enableit'
+              disk: '/dev/sdb'
+              type: 'sat'
+              instance: 'htzhel1-ax41a.enableit:63385'
+              alert_id: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
+            exp_annotations:
+              summary: 'Disk **/dev/sdb** on **jacen.enableit** has disk sata failure'
+              description: |
+                Disk **/dev/sdb** on **jacen.enableit** has disk sata failure,
+                instance="**htzhel1-ax41a.enableit:63385**",
+                type="**sat**",
+
+                UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **1** is concerning and indicates potential issues with the data cable or connections.
diff --git a/argocd-helm-charts/prometheus-linuxaid/values.yaml b/argocd-helm-charts/prometheus-linuxaid/values.yaml
index 8aca4848..1dd55b4e 100644
--- a/argocd-helm-charts/prometheus-linuxaid/values.yaml
+++ b/argocd-helm-charts/prometheus-linuxaid/values.yaml
@@ -54,6 +54,7 @@ prometheusRule:
   smart: true
   ssacli: true
   zfsExporter: true
+  smartmon_sata: true
 alertmanager:
   version: v0.27.0
   config: