Skip to content

Commit

Permalink
Fix 'ceph_disk_occupation' query expressions
Browse files Browse the repository at this point in the history
Address the label changes in the 'ceph_disk_occupation' metric.

What is the change in the 'ceph_disk_occupation' metric?
The 'ceph_disk_occupation' result no longer has an 'exported_instance'
label; it has an 'instance' label instead.

What is the issue we are facing because of it?
We are hitting 'PrometheusRuleFailures' because of this label change
in the alerts / rules where this metric is used.
A second issue is that some of the query expressions no longer return
any results.

What is the solution?
Update the query expressions, changing 'exported_instance' to 'instance'.
Any 'label_replace' action that changes the 'exported_instance' label to
the 'instance' label is no longer required (as the 'instance' label is
directly available now).

Signed-off-by: Arun Kumar Mohan <amohan@redhat.com>
  • Loading branch information
aruniiird committed Dec 13, 2024
1 parent 1d6235c commit a81e357
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 40 deletions.
8 changes: 4 additions & 4 deletions controllers/storagecluster/prometheus/localcephrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ spec:
- name: ceph.rules
rules:
- expr: |
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) (label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)"))
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) (label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","instance","(.*)"))
record: cluster:ceph_node_down:join_kube
- expr: |
avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
- name: telemeter.rules
rules:
Expand Down Expand Up @@ -171,7 +171,7 @@ spec:
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskNotResponding.md
expr: |
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)")
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)")
for: 15m
labels:
severity: critical
Expand All @@ -183,7 +183,7 @@ spec:
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskUnavailable.md
expr: |
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)")
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)")
for: 1m
labels:
severity: critical
Expand Down
24 changes: 6 additions & 18 deletions metrics/deploy/prometheus-ocs-rules-external.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,11 @@ spec:
groups:
- name: ocs_performance.rules
rules:
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m
- name: ODF_standardized_metrics.rules
rules:
Expand Down Expand Up @@ -54,14 +48,8 @@ spec:
system_type: OCS
system_vendor: Red Hat
record: odf_system_throughput_total_bytes
- expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon)
(1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by
(instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n
\ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]),
1))\n )\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))))
labels:
system_type: OCS
system_vendor: Red Hat
Expand Down
24 changes: 6 additions & 18 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,11 @@ spec:
groups:
- name: ocs_performance.rules
rules:
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m
- name: ODF_standardized_metrics.rules
rules:
Expand Down Expand Up @@ -54,14 +48,8 @@ spec:
system_type: OCS
system_vendor: Red Hat
record: odf_system_throughput_total_bytes
- expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon)
(1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by
(instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n
\ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]),
1))\n )\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))))
labels:
system_type: OCS
system_vendor: Red Hat
Expand Down

0 comments on commit a81e357

Please sign in to comment.