Skip to content

Commit

Permalink
add smartmon_sata rule to handle sata cable failures
Browse files Browse the repository at this point in the history
  • Loading branch information
iminfinity committed Nov 14, 2024
1 parent 1f52b98 commit 9288b25
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 0 deletions.
18 changes: 18 additions & 0 deletions argocd-helm-charts/prometheus-linuxaid/rules/smartmon_sata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Alerting rules for SATA link (UDMA CRC) errors reported through smartmon
# metrics.
# This file is inlined under `spec:` of a PrometheusRule by the chart template
# (via .Files.Get), so keep it free of a leading `---` document marker.
groups:
  - name: 'monitor::system::service::smartmon'
    rules:
      # Fires once a disk has reported a non-zero UDMA CRC error count for 3h
      # on a host that also has an obmondo_monitoring series (matched on the
      # certname label only).
      # NOTE(review): the join does not filter obmondo_monitoring by alert_id
      # or by value - confirm that any matching series should enable this alert.
      # NOTE(review): presumably the raw value is a lifetime counter, so the
      # alert may keep firing after the cable is fixed - confirm whether an
      # increase()-based expression is wanted instead.
      - alert: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
        expr: |
          smartmon_udma_crc_error_count_raw_value != 0 and on(certname) obmondo_monitoring
        for: 3h
        labels:
          severity: critical
          alert_id: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
        annotations:
          summary: 'Disk **{{ $labels.disk }}** on **{{ $labels.certname }}** has disk sata failure'
          # Literal block: each line below is emitted as-is, newline included.
          description: |
            Disk **{{ $labels.disk }}** on **{{ $labels.certname }}** has disk sata failure,
            instance="**{{ $labels.instance }}**",
            type="**{{ $labels.type }}**",
            UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **{{ $value }}** is concerning and indicates potential issues with the data cable or connections.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{- /*
PrometheusRule carrying the smartmon SATA alert group.
Rendered only when .Values.prometheusRule.smartmon_sata is truthy.
The rule file is read with .Files.Get, so its own `{{ ... }}` alert
placeholders are passed through untouched rather than evaluated by Helm.
*/ -}}
{{- if .Values.prometheusRule.smartmon_sata }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smartmon-sata-rules
  # Quoted so the rendered scalar is always parsed as a string.
  namespace: "monitoring-{{ .Values.customerid }}"
spec:
  {{- $.Files.Get "rules/smartmon_sata.yaml" | nindent 4 }}
{{- end }}
35 changes: 35 additions & 0 deletions argocd-helm-charts/prometheus-linuxaid/tests/smartmon_sata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
---
# promtool unit test for ../rules/smartmon_sata.yaml.
evaluation_interval: 1m

rule_files:
  - ../rules/smartmon_sata.yaml

tests:
  - interval: 1m
    input_series:
      # Join-side series for the host; the rule matches it on certname.
      - series: 'obmondo_monitoring{certname="jacen.enableit", alert_id="monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value"}'
        values: '1x200'
      # Disk reporting a constant CRC error count of 1: expected to alert.
      - series: 'smartmon_udma_crc_error_count_raw_value{certname="jacen.enableit", disk="/dev/sdb", type="sat", instance="htzhel1-ax41a.enableit:63385"}'
        values: '1x200'
      # Disk reporting zero CRC errors: must not appear in the firing alerts.
      - series: 'smartmon_udma_crc_error_count_raw_value{certname="jacen.enableit", disk="/dev/sda", type="sat", instance="htzhel1-ax41a.enableit:63385"}'
        values: '0x200'

    alert_rule_test:
      # One evaluation before the rule's `for: 3h` has elapsed: the alert is
      # still pending, so nothing may be firing yet.
      - alertname: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
        eval_time: 2h59m
        exp_alerts: []

      # At exactly 3h the alert fires for the failing disk only.
      - alertname: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
        eval_time: 3h
        exp_alerts:
          - exp_labels:
              severity: 'critical'
              certname: 'jacen.enableit'
              disk: '/dev/sdb'
              type: 'sat'
              instance: 'htzhel1-ax41a.enableit:63385'
              alert_id: 'monitor::system::service::smartmon::smartmon_udma_crc_error_count_raw_value'
            exp_annotations:
              summary: 'Disk **/dev/sdb** on **jacen.enableit** has disk sata failure'
              description: |
                Disk **/dev/sdb** on **jacen.enableit** has disk sata failure,
                instance="**htzhel1-ax41a.enableit:63385**",
                type="**sat**",
                UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **1** is concerning and indicates potential issues with the data cable or connections.
1 change: 1 addition & 0 deletions argocd-helm-charts/prometheus-linuxaid/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ prometheusRule:
smart: true
ssacli: true
zfsExporter: true
smartmon_sata: true
alertmanager:
version: v0.27.0
config:
Expand Down

0 comments on commit 9288b25

Please sign in to comment.