diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.json new file mode 100644 index 000000000..c0306a212 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.json @@ -0,0 +1,51 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/bitnami-labs/sealed-secrets.git", + "subdir": "contrib/prometheus-mixin" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/ceph/ceph.git", + "subdir": "monitoring/ceph-mixin" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "opensearch-mixin" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "74e445ae4a2582f978bae2e0e9b63024d7f759d6" + }, + { + "source": { + "git": { + "remote": "https://gitlab.com/uneeq-oss/cert-manager-mixin.git", + "subdir": "" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.lock.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.lock.json new file mode 100644 index 000000000..da561fa06 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/jsonnetfile.lock.json @@ -0,0 +1,291 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/bitnami-labs/sealed-secrets.git", + "subdir": "contrib/prometheus-mixin" + } + }, + "version": "ec23dd3eacc840a6b0a901945f8910826c19a44e", + "sum": "uqhnRGEja2PfXTtsHkmbcrc8Wr0oox9xZKPP6zYXsJE=" + }, + { + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana.git", + "subdir": "grafana" + } + }, + "version": "5698c8940b6dadca3f42107b7839557bc041761f", + "sum": "l6fPvh3tW6fWot308w71QY/amrYsFPeitvz1IgJxqQA=" + }, + { + "source": { + "git": { + "remote": "https://github.com/ceph/ceph.git", + "subdir": "monitoring/ceph-mixin" + } + }, + "version": "b045f58546c0c701df6b29bfb796c93049eb6fe3", + "sum": "ZnyCIu25NBI6Q3Ru7QK1DHf7DBMEURSMQdEJXzCyIgA=" + }, + { + "source": { + "git": { + "remote": "https://github.com/etcd-io/etcd.git", + "subdir": "contrib/mixin" + } + }, + "version": "c00593bbd402cd4cc7bae421598343cb056b5294", + "sum": "IXI3LQIT9NmTPJAk8WLUJd5+qZfcGpeNCyWIK7oEpws=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafana.git", + "subdir": "grafana-mixin" + } + }, + "version": "1120f9e255760a3c104b57871fcb91801e934382", + "sum": "MkjR7zCgq6MUZgjDzop574tFKoTX2OBr7DTwm1K+Ofs=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "a1d61cce1da59c71409b99b5c7568511fec661ea", + "sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet-7.0" + } + }, + "version": "a1d61cce1da59c71409b99b5c7568511fec661ea", + "sum": "gCtR9s/4D5fxU9aKXg0Bru+/njZhA0YjLjPiASc61FM=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": 
"733beadbc8dab55c5fe1bcdcf0d8a2d215759a55", + "sum": "eyuJ0jOXeA4MrobbNgU4/v5a7ASDHslHZ0eS6hDdWoI=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "733beadbc8dab55c5fe1bcdcf0d8a2d215759a55", + "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.0.0" + } + }, + "version": "733beadbc8dab55c5fe1bcdcf0d8a2d215759a55", + "sum": "0BvzR0i4bS4hc2O3xDv6i9m52z7mPrjvqxtcPrGhynA=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "f4f72ec792de41019e817b3d48840c622fc28951", + "sum": "tfvNQmtjIkqbgy7FU3FDy34OYoZhWskDCQanbiScaHU=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-builder" + } + }, + "version": "f4f72ec792de41019e817b3d48840c622fc28951", + "sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "opensearch-mixin" + } + }, + "version": "f4f72ec792de41019e817b3d48840c622fc28951", + "sum": "AK83KBy5roMxhT0taG54ERV20oG9mhaCJA+EHRzuPO4=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/docsonnet.git", + "subdir": "doc-util" + } + }, + "version": "6ac6c69685b8c29c54515448eaca583da2d88150", + "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c", + "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", + "subdir": "" + } + }, + "version": "dec42129576a19dc175aea145adbbc0d2fabe074", + "sum": "aV/5H7usvWMDkjn75KCgBm8cjChSKso29Dk+AVpYt3c=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + "subdir": "jsonnet/kube-state-metrics" + } + }, + "version": "db01d3eb023c0e0d7d727c1f91f241b256d9a7f3", + "sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + "subdir": "jsonnet/kube-state-metrics-mixin" + } + }, + "version": "db01d3eb023c0e0d7d727c1f91f241b256d9a7f3", + "sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "74e445ae4a2582f978bae2e0e9b63024d7f759d6", + "sum": "roigNXE23iQFFU6G8t7/SF+UKqG6zV9yQotiClXnkoM=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/mixin" + } + }, + "version": "caafb96d5c8c87873bab7485522f9e3d12bc835a", + "sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=", + "name": "prometheus-operator-mixin" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "caafb96d5c8c87873bab7485522f9e3d12bc835a", + "sum": "nkQ22KTboIYhXEnBiINtMw7M1ptoO3UK6+s7O/G5fK8=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager.git", + "subdir": "doc/alertmanager-mixin" 
+ } + }, + "version": "a6df704408ba303c5d1d4e8e751da227e0ab08bf", + "sum": "IpF46ZXsm+0wJJAPtAre8+yxTNZA57mBqGpBP/r7/kw=", + "name": "alertmanager" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter.git", + "subdir": "docs/node-mixin" + } + }, + "version": "b9d0932179a0c5b3a8863f3d6cdafe8584cedc8e", + "sum": "rhUvbqviGjQ2mwsRhHKMN0TiS3YvnYpUXHew3XlQ+Wg=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/prometheus.git", + "subdir": "documentation/prometheus-mixin" + } + }, + "version": "6ef1eb4e68d4b8132c3cbbaa20bdb2b95de26a1a", + "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI=", + "name": "prometheus" + }, + { + "source": { + "git": { + "remote": "https://github.com/pyrra-dev/pyrra.git", + "subdir": "config/crd/bases" + } + }, + "version": "551856d42dff02ec38c5b0ea6a2d99c4cb127e82", + "sum": "bY/Pcrrbynguq8/HaI88cQ3B2hLv/xc+76QILY7IL+g=", + "name": "pyrra" + }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos.git", + "subdir": "mixin" + } + }, + "version": "9f2af3f78f89d1e1b282f85d89f3d2f4f727b74c", + "sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=", + "name": "thanos-mixin" + }, + { + "source": { + "git": { + "remote": "https://github.com/yugui/jsonnetunit.git", + "subdir": "jsonnetunit" + } + }, + "version": "6927c58cae7624a00f368b977ccc477d4f74071f", + "sum": "9FFqqln65hooRF0l6rjICDtnTxUlmDj34+sKMh4sjPI=" + }, + { + "source": { + "git": { + "remote": "https://gitlab.com/uneeq-oss/cert-manager-mixin.git", + "subdir": "" + } + }, + "version": "eae22f642aaa5d422e4766f6811df2158fc05539", + "sum": "DOg3fzS0OWrjjRPVsKgxID/rk9AC3ESQ4gDELc2RNgM=" + } + ], + "legacyImports": false +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/alertmanager b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/alertmanager new file mode 120000 index 000000000..96f32a46b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/alertmanager @@ -0,0 +1 @@ +github.com/prometheus/alertmanager/doc/alertmanager-mixin \ No newline at end of file diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/ceph-mixin b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/ceph-mixin new file mode 120000 index 000000000..dbbe3337b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/ceph-mixin @@ -0,0 +1 @@ +github.com/ceph/ceph/monitoring/ceph-mixin \ No newline at end of file diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/cert-manager-mixin b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/cert-manager-mixin new file mode 120000 index 000000000..98c87112d --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/cert-manager-mixin @@ -0,0 +1 @@ +gitlab.com/uneeq-oss/cert-manager-mixin \ No newline at end of file diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/common-lib b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/common-lib new file mode 120000 index 000000000..5d4320e59 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/common-lib @@ -0,0 +1 @@ +github.com/grafana/jsonnet-libs/common-lib \ No newline at end of file diff --git 
a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/doc-util b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/doc-util new file mode 120000 index 000000000..dcfde67cf --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/doc-util @@ -0,0 +1 @@ +github.com/jsonnet-libs/docsonnet/doc-util \ No newline at end of file diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/.gitignore b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/.gitignore new file mode 100644 index 000000000..b55735e04 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/.gitignore @@ -0,0 +1 @@ +manifests/ diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/Makefile b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/Makefile new file mode 100644 index 000000000..c55b2a812 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/Makefile @@ -0,0 +1,56 @@ +# Prometheus Mixin Makefile +# Heavily copied from upstream project kubenetes-mixin + +PROMETHEUS_IMAGE := prom/prometheus:v2.21.0 + +JSONNET_FMT := jsonnetfmt + +all: fmt prometheus_alerts.yaml prometheus_rules.yaml dashboards_out lint test ## Generate files, lint and test + +fmt: ## Format Jsonnet + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +prometheus_alerts.yaml: mixin.libsonnet lib/alerts.jsonnet alerts/*.libsonnet ## Generate Alerts YAML + @mkdir -p manifests + jsonnet -S lib/alerts.jsonnet > manifests/$@ + +prometheus_rules.yaml: mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet ## Generate Rules YAML + @mkdir -p manifests + jsonnet -S lib/rules.jsonnet > manifests/$@ + +dashboards_out: mixin.libsonnet lib/dashboards.jsonnet dashboards/*.libsonnet ## Generate Dashboards JSON + jsonnet -J vendor -m manifests lib/dashboards.jsonnet + +lint: prometheus_alerts.yaml prometheus_rules.yaml ## Lint and check YAML + find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + while read f; do \ + $(JSONNET_FMT) "$$f" | diff -u "$$f" -; \ + done + docker run \ + -v $(PWD)/manifests:/tmp \ + --entrypoint '/bin/promtool' \ + $(PROMETHEUS_IMAGE) \ + check rules /tmp/prometheus_rules.yaml; \ + docker run \ + -v $(PWD)/manifests:/tmp \ + --entrypoint '/bin/promtool' \ + $(PROMETHEUS_IMAGE) \ + check rules /tmp/prometheus_alerts.yaml + +clean: ## Clean up generated files + rm -rf manifests/ + +# TODO: Find out why official prom images segfaults during `test rules` if not root +test: prometheus_alerts.yaml prometheus_rules.yaml ## Test generated files + docker run \ + -v $(PWD):/tmp \ + --user root \ + --entrypoint '/bin/promtool' \ + $(PROMETHEUS_IMAGE) \ + test rules /tmp/tests.yaml + +.PHONY: help +help: + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' + diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/README.md b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/README.md new file mode 100644 index 000000000..cf76ad42a --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/README.md @@ -0,0 +1,106 @@ +# Sealed Secrets Metrics + +The Sealed Secrets Controller running in Kubernetes exposes Prometheus +metrics on `*:8081/metrics`. + +These metrics enable operators to observe how it is performing. For example +how many `SealedSecret` unseals have been attempted and how many errors may +have occured due to RBAC permissions, wrong key, corrupted data, etc. + +These metrics can be scraped by a Prometheus server and viewed in Prometheus, +displayed on a Grafana dashboard and/or trigger alerts to Slack/etc. + +## Prometheus Mixin + +A Prometheus mixin bundles all of the metric related concerns into a single +package for users of the application to consume. +Typically this includes dashboards, recording rules, alerts and alert logic +tests. + +By creating a mixin, application maintainers and contributors to the project +can enshrine knowledge about operating the application and potential SLO's +that users may wish to use. + +For more details about this concept see the [monitoring-mixins](https://github.com/monitoring-mixins/docs) +project on GitHub. + +## Scraping the metrics manually + +After installing the Sealed Secrets Controller you can access the metrics via +Kubernetes port-forward to your pod: + +``` +$ kubectl port-forward sealed-secrets-controller-6566dc69c6-lqr6x 8081 & +[1] 293283 +``` + +Then query the metrics endpoint: +``` +$ curl localhost:8081/metrics + + +# HELP sealed_secrets_controller_build_info Build information. 
+# TYPE sealed_secrets_controller_build_info gauge +sealed_secrets_controller_build_info{revision="v0.12.1"} 0 +# HELP sealed_secrets_controller_unseal_errors_total Total number of sealed secret unseal errors by reason +# TYPE sealed_secrets_controller_unseal_errors_total counter +sealed_secrets_controller_unseal_errors_total{reason="fetch"} 0 +sealed_secrets_controller_unseal_errors_total{reason="status"} 0 +sealed_secrets_controller_unseal_errors_total{reason="unmanaged"} 0 +sealed_secrets_controller_unseal_errors_total{reason="unseal"} 0 +sealed_secrets_controller_unseal_errors_total{reason="update"} 0 +# HELP sealed_secrets_controller_unseal_requests_total Total number of sealed secret unseal requests +# TYPE sealed_secrets_controller_unseal_requests_total counter +sealed_secrets_controller_unseal_requests_total 86 +``` + +## Scraping metrics with the Prometheus Operator + +The [Prometheus Operator](https://github.com/coreos/prometheus-operator) +supports a couple of Kubernetes native scrape target `CustomResourceDefinitions`. + +This project includes a [PodMonitor](../../controller-podmonitor.jsonnet +) CRD definition in jsonnet. To use this: + +Compile jsonnet to yaml: +``` +$ make controller-podmonitor.yaml +kubecfg show -V CONTROLLER_IMAGE=docker.io/bitnami/sealed-secrets-controller:latest -V IMAGE_PULL_POLICY=Always -o yaml controller-podmonitor.jsonnet > controller-podmonitor.yaml.tmp +mv controller-podmonitor.yaml.tmp controller-podmonitor.yaml +``` + +Submit the `PodMonitor` CustomResourceDefinition to Kubernetes API: +``` +$ kubectl apply -f controller-podmonitor.yaml +``` + +The Prometheus Operator will trigger a reload of Prometheus configuration and +you should see the Sealed Secrets Controller in your Prometheus UI under +`Service Discovery` and `Targets`. + +## Grafana dashboard + +The [dashboard](./dashboards/sealed-secrets-controller.json) can be imported +standalone into Grafana. You may need to edit the datasource if you have +configured your Prometheus datasource with a different name. + +## Using the mixin with kube-prometheus + +See the [kube-prometheus](https://github.com/coreos/kube-prometheus#kube-prometheus) +project documentation for instructions on importing mixins. + +## Using the mixin as raw YAML files + +If you don't use the jsonnet based `kube-prometheus` project then you will need to +generate the raw yaml files for inclusion in your Prometheus installation. 
+ +Install the `jsonnet` dependencies: +``` +$ go get github.com/google/go-jsonnet/cmd/jsonnet +$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt +``` + +Generate yaml: +``` +$ make +``` diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/alerts.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/alerts.libsonnet new file mode 100644 index 000000000..08bfcfe15 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/alerts.libsonnet @@ -0,0 +1,3 @@ +// Sealed Secrets Alertmanager Alerts + +(import 'sealed-secrets-alerts.libsonnet') diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/sealed-secrets-alerts.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/sealed-secrets-alerts.libsonnet new file mode 100644 index 000000000..cf1f268af --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/alerts/sealed-secrets-alerts.libsonnet @@ -0,0 +1,34 @@ +{ + prometheusAlerts+:: { + groups+: [{ + name: 'sealed-secrets', + rules: [ + // SealedSecretsErrorRateHigh: + // Method: Alert on occurence of errors by looking for a non-zero rate of errors over past 5 minutes + // Pros: + // - An app deploy is likely broken if a secret can't be updated by Controller. + // Caveats: + // - Probably better to leave app deploy breakages to the app or CD systems monitoring. + // - Potentially noisy. Controller attempts to unseal 5 times, so if it exceeds on the 4th attempt then all is fine but this alert will trigger. + // - Usage of an invalid cert.pem with kubeseal will trigger this alert, it would be better to distinguish alerts due to controller or user + // - 'for' clause not used because we are unlikely to have a sustained rate of errors unless there is a LOT of secret churn in cluster. + // Rob Ewaschuk - My Philosophy on Alerting: https://docs.google.com/document/d/199PqyG3UsyXlwieHaqbGiWVa8eMWi8zzAn0YfcApr8Q/edit + { + alert: 'SealedSecretsUnsealErrorHigh', + expr: ||| + sum by (reason, namespace) (rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0 + ||| % $._config, + // 'for': '5m', // Not used, see caveats above. 
+ labels: { + severity: 'warning', + }, + annotations: { + summary: 'Sealed Secrets Unseal Error High', + description: 'High number of errors during unsealing Sealed Secrets in {{ $labels.namespace }} namespace.', + runbook_url: 'https://github.com/bitnami-labs/sealed-secrets', + }, + }, + ], + }], + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/config.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/config.libsonnet new file mode 100644 index 000000000..358f4ff12 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/config.libsonnet @@ -0,0 +1,4 @@ +// Sealed Secrets Prometheus Mixin Config +{ + _config+:: {}, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/dashboards.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 000000000..d1958dcaf --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,7 @@ +// Sealed Secrets Grafana Dashboards + +{ + grafanaDashboards+:: { + 'sealed-secrets-controller.json': (import 'sealed-secrets-controller.json'), + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/sealed-secrets-controller.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/sealed-secrets-controller.json new file mode 100644 index 000000000..5fc9a4d5b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/dashboards/sealed-secrets-controller.json @@ -0,0 +1,302 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Sealed Secrets Controller", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "iteration": 1585599163503, + "links": [ + { + "icon": "external link", + "tags": [], + "title": "GitHub", + "tooltip": "View Project on GitHub", + "type": "link", + "url": "https://github.com/bitnami-labs/sealed-secrets" + } + ], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of requests to unseal a SealedSecret.\n\nThis can include non-obvious operations such as deleting a SealedSecret.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], 
+ "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(sealed_secrets_controller_unseal_requests_total{}[1m]))", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "rps", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Unseal Request Rate/s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of errors when unsealing a SealedSecret. \n\nReason for error included as label value, eg:\n- unseal = cryptography issue (key/namespace) or RBAC\n- unmanaged = destination Secret wasn't created by SealedSecrets\n- update = potentially RBAC\n- status = potentially RBAC\n- fetch = potentially RBAC\n", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(sealed_secrets_controller_unseal_errors_total{pod=~\"$pod\"}[1m])) by (reason)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ reason }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Unseal Error Rate/s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": "label_values(kube_pod_info, pod)", + "hide": 0, + 
"includeAll": true, + "label": null, + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info, pod)", + "refresh": 1, + "regex": "/^sealed-secrets-controller.*$/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Sealed Secrets Controller", + "uid": "UuEtZCVWz", + "version": 2 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/alerts.jsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/alerts.jsonnet new file mode 100644 index 000000000..d396a38cd --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import '../mixin.libsonnet').prometheusAlerts) diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/dashboards.jsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/dashboards.jsonnet new file mode 100644 index 000000000..dadaebe9b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import '../mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/rules.jsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/rules.jsonnet new file mode 100644 index 000000000..2d7fa91f7 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/lib/rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import '../mixin.libsonnet').prometheusRules) diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/mixin.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/mixin.libsonnet new file mode 100644 index 000000000..404c51017 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/mixin.libsonnet @@ -0,0 +1,10 @@ +// Prometheus Mixin +// Follows the kubernetes-mixin project pattern here: https://github.com/kubernetes-monitoring/kubernetes-mixin +// Mixin design doc: 
https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit + +// This file will be imported during build for all Promethei + +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/rules/rules.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/rules/rules.libsonnet new file mode 100644 index 000000000..340328d9c --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/rules/rules.libsonnet @@ -0,0 +1,8 @@ +// Sealed Secrets Prometheus Recording Rules +{ + prometheusRules+:: { + groups+: [ + // import ('sealed-secrets-rules.libsonnet') + ], + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/tests.yaml b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/tests.yaml new file mode 100644 index 000000000..b68e2c9e4 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/bitnami-labs/sealed-secrets/contrib/prometheus-mixin/tests.yaml @@ -0,0 +1,28 @@ +--- +rule_files: +- /tmp/manifests/prometheus_alerts.yaml +- /tmp/manifests/prometheus_rules.yaml + +evaluation_interval: 1m + +tests: +- interval: 1m + input_series: + - series: 'sealed_secrets_controller_unseal_errors_total{reason="update",namespace="test"}' + values: '0+0x5 1+1x5' + - series: 'sealed_secrets_controller_unseal_errors_total{reason="unseal",namespace="test"}' + values: '0+0x10' + alert_rule_test: + - eval_time: 5m + alertname: SealedSecretsUnsealErrorHigh + - eval_time: 10m + alertname: SealedSecretsUnsealErrorHigh + exp_alerts: + - exp_labels: + severity: warning + namespace: test + reason: update + exp_annotations: + summary: 'Sealed Secrets Unseal Error High' + description: 'High number of errors during unsealing Sealed Secrets in test namespace.' 
+ runbook_url: 'https://github.com/bitnami-labs/sealed-secrets' diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet new file mode 100644 index 000000000..2813ee3d2 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet @@ -0,0 +1,377 @@ +local defaults = { + local defaults = self, + namespace: 'default', + version: '7.5.10', + image: 'docker.io/grafana/grafana:' + defaults.version, + commonLabels:: { + 'app.kubernetes.io/name': 'grafana', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'grafana', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + replicas: 1, + port: 3000, + resources: { + requests: { cpu: '100m', memory: '100Mi' }, + limits: { cpu: '200m', memory: '200Mi' }, + }, + + dashboards: {}, + rawDashboards: {}, + folderDashboards: {}, + folderUidGenerator(folder): '', + datasources: [{ + name: 'prometheus', + type: 'prometheus', + access: 'proxy', + orgId: 1, + url: 'http://prometheus-k8s.' + defaults.namespace + '.svc:9090', + version: 1, + editable: false, + }], + // Forces pod restarts when dashboards are changed + dashboardsChecksum: false, + config: { + sections: { + date_formats: { default_timezone: 'UTC' }, + }, + }, + ldap: null, + plugins: [], + env: [], + containers: [], +}; + +function(params) { + local g = self, + _config:: defaults + params, + _metadata:: { + name: 'grafana', + namespace: g._config.namespace, + labels: g._config.commonLabels, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: g._metadata, + automountServiceAccountToken: false, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: g._metadata, + spec: { + selector: g.deployment.spec.selector.matchLabels, + ports: [ + { name: 'http', targetPort: 'http', port: 3000 }, + ], + }, + }, + + config: { + apiVersion: 'v1', + kind: 'Secret', + metadata: g._metadata { + name: 'grafana-config', + }, + type: 'Opaque', + stringData: { + 'grafana.ini': std.manifestIni(g._config.config), + } + if g._config.ldap != null then { 'ldap.toml': g._config.ldap } else {}, + }, + + dashboardDefinitions: { + apiVersion: 'v1', + kind: 'ConfigMapList', + items: [ + { + local dashboardName = 'grafana-dashboard-' + std.strReplace(name, '.json', ''), + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: g._metadata { + name: dashboardName, + }, + data: { [name]: std.manifestJsonEx(g._config.dashboards[name], ' ') }, + } + for name in std.objectFields(g._config.dashboards) + ] + [ + { + local dashboardName = 'grafana-dashboard-' + std.strReplace(name, '.json', ''), + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: g._metadata { + name: dashboardName, + }, + data: { [name]: std.manifestJsonEx(g._config.folderDashboards[folder][name], ' ') }, + } + for folder in std.objectFields(g._config.folderDashboards) + for name in std.objectFields(g._config.folderDashboards[folder]) + ] + ( + if std.length(g._config.rawDashboards) > 0 then + [ + + { + local dashboardName = 'grafana-dashboard-' + std.strReplace(name, '.json', ''), + apiVersion: 'v1', + kind: 
'ConfigMap', + metadata: g._metadata { + name: dashboardName, + }, + data: { [name]: g._config.rawDashboards[name] }, + } + for name in std.objectFields(g._config.rawDashboards) + ] + else + [] + ), + }, + + dashboardSources: + local dashboardSources = { + apiVersion: 1, + providers: + ( + if std.length(g._config.dashboards) + + std.length(g._config.rawDashboards) > 0 then [ + { + name: '0', + orgId: 1, + folder: 'Default', + folderUid: g._config.folderUidGenerator('Default'), + type: 'file', + options: { + path: '/grafana-dashboard-definitions/0', + }, + }, + ] else [] + ) + + [ + { + name: folder, + orgId: 1, + folder: folder, + folderUid: g._config.folderUidGenerator(folder), + type: 'file', + options: { + path: '/grafana-dashboard-definitions/' + folder, + }, + } + for folder in std.objectFields(g._config.folderDashboards) + ], + }; + + { + kind: 'ConfigMap', + apiVersion: 'v1', + metadata: g._metadata { + name: 'grafana-dashboards', + }, + data: { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') }, + }, + + dashboardDatasources: { + apiVersion: 'v1', + kind: 'Secret', + metadata: g._metadata { + name: 'grafana-datasources', + }, + type: 'Opaque', + stringData: { + 'datasources.yaml': std.manifestJsonEx( + { + apiVersion: 1, + datasources: g._config.datasources, + }, ' ' + ), + }, + }, + + deployment: + local configVolume = { + name: 'grafana-config', + secret: { secretName: g.config.metadata.name }, + }; + local configVolumeMount = { + name: configVolume.name, + mountPath: '/etc/grafana', + readOnly: false, + }; + + local storageVolume = { + name: 'grafana-storage', + emptyDir: {}, + }; + local storageVolumeMount = { + name: storageVolume.name, + mountPath: '/var/lib/grafana', + readOnly: false, + }; + + local datasourcesVolume = { + name: 'grafana-datasources', + secret: { secretName: g.dashboardDatasources.metadata.name }, + }; + local datasourcesVolumeMount = { + name: datasourcesVolume.name, + mountPath: '/etc/grafana/provisioning/datasources', + readOnly: false, + }; + + local dashboardsVolume = { + name: 'grafana-dashboards', + configMap: { name: g.dashboardSources.metadata.name }, + }; + local dashboardsVolumeMount = { + name: dashboardsVolume.name, + mountPath: '/etc/grafana/provisioning/dashboards', + readOnly: false, + }; + // A volume on /tmp is needed to let us use 'readOnlyRootFilesystem: true' + local pluginTmpVolume = { + name: 'tmp-plugins', + emptyDir: { + medium: 'Memory', + }, + }; + local pluginTmpVolumeMount = { + mountPath: '/tmp', + name: 'tmp-plugins', + readOnly: false, + }; + + local volumeMounts = + [ + storageVolumeMount, + datasourcesVolumeMount, + dashboardsVolumeMount, + pluginTmpVolumeMount, + ] + + [ + { + local dashboardName = std.strReplace(name, '.json', ''), + name: 'grafana-dashboard-' + dashboardName, + mountPath: '/grafana-dashboard-definitions/0/' + dashboardName, + readOnly: false, + } + for name in std.objectFields(g._config.dashboards + g._config.rawDashboards) + ] + + [ + { + local dashboardName = std.strReplace(name, '.json', ''), + name: 'grafana-dashboard-' + dashboardName, + mountPath: '/grafana-dashboard-definitions/' + folder + '/' + dashboardName, + readOnly: false, + } + for folder in std.objectFields(g._config.folderDashboards) + for name in std.objectFields(g._config.folderDashboards[folder]) + ] + ( + if std.length(g._config.config) > 0 then [configVolumeMount] else [] + ); + + local volumes = + [ + storageVolume, + datasourcesVolume, + dashboardsVolume, + pluginTmpVolume, + ] + + [ + { + local dashboardName = 
'grafana-dashboard-' + std.strReplace(name, '.json', ''), + name: dashboardName, + configMap: { name: dashboardName }, + } + for name in std.objectFields(g._config.dashboards) + ] + + [ + { + local dashboardName = 'grafana-dashboard-' + std.strReplace(name, '.json', ''), + name: dashboardName, + configMap: { name: dashboardName }, + } + for folder in std.objectFields(g._config.folderDashboards) + for name in std.objectFields(g._config.folderDashboards[folder]) + ] + + [ + { + local dashboardName = 'grafana-dashboard-' + std.strReplace(name, '.json', ''), + name: dashboardName, + configMap: { name: dashboardName }, + } + for name in std.objectFields(g._config.rawDashboards) + ] + + if std.length(g._config.config) > 0 then [configVolume] else []; + + local plugins = ( + if std.length(g._config.plugins) == 0 then + [] + else + [{ name: 'GF_INSTALL_PLUGINS', value: std.join(',', g._config.plugins) }] + ); + + local grafanaContainer = { + name: 'grafana', + image: g._config.image, + env: g._config.env + plugins, + volumeMounts: volumeMounts, + ports: [{ + name: 'http', + containerPort: g._config.port, + }], + readinessProbe: { + httpGet: { + path: '/api/health', + port: grafanaContainer.ports[0].name, + }, + }, + resources: g._config.resources, + securityContext: { + capabilities: { drop: ['ALL'] }, + allowPrivilegeEscalation: false, + readOnlyRootFilesystem: true, + seccompProfile: { type: 'RuntimeDefault' }, + }, + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: g._metadata, + spec: { + replicas: g._config.replicas, + selector: { + matchLabels: g._config.selectorLabels, + }, + template: { + metadata: { + labels: g._config.commonLabels, + annotations: { + [if std.length(g._config.config) > 0 then 'checksum/grafana-config']: std.md5(std.toString(g.config)), + 'checksum/grafana-datasources': std.md5(std.toString(g.dashboardDatasources)), + [if g._config.dashboardsChecksum then 'checksum/grafana-dashboards']: std.md5(std.toString(g.dashboardDefinitions)), + 'checksum/grafana-dashboardproviders': std.md5(std.toString(g.dashboardSources)), + }, + }, + spec: { + containers: [grafanaContainer] + g._config.containers, + volumes: volumes, + serviceAccountName: g.serviceAccount.metadata.name, + nodeSelector: { + 'kubernetes.io/os': 'linux', + }, + securityContext: { + fsGroup: 65534, + runAsNonRoot: true, + runAsUser: 65534, + }, + }, + }, + }, + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/jsonnetfile.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/jsonnetfile.json new file mode 100644 index 000000000..650733a05 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/brancz/kubernetes-grafana/grafana/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + } + ], + "legacyImports": false +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/.gitignore b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/.gitignore new file mode 100644 index 000000000..22d0d82f8 --- /dev/null +++ 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/.gitignore @@ -0,0 +1 @@ +vendor diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/CMakeLists.txt b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/CMakeLists.txt new file mode 100644 index 000000000..e63c740b7 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/CMakeLists.txt @@ -0,0 +1,57 @@ +if(WITH_GRAFANA) + set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard" + CACHE PATH "Location for grafana dashboards") + file(GLOB CEPH_GRAFANA_DASHBOARDS "dashboards_out/*.json") + install(FILES + ${CEPH_GRAFANA_DASHBOARDS} + DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR}) + if(WITH_TESTS) + set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) + if(NOT CEPH_BUILD_VIRTUALENV) + include(AddCephTest) + set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) + + add_test(NAME jsonnet-bundler-build + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/jsonnet-bundler-build.sh ${CMAKE_CURRENT_BINARY_DIR}) + set_property(TEST jsonnet-bundler-build PROPERTY + FIXTURES_SETUP jsonnet-bundler) + add_test(NAME jsonnet-bundler-cleanup + COMMAND rm -rf jsonnet-bundler ${CMAKE_CURRENT_BINARY_DIR}/jb) + set_property(TEST jsonnet-bundler-cleanup PROPERTY + FIXTURES_CLEANUP jsonnet-bundler) + + add_tox_test(grafana-lint TOX_ENVS lint) + add_tox_test(jsonnet-lint TOX_ENVS jsonnet-lint) + set_property(TEST run-tox-jsonnet-lint PROPERTY + FIXTURES_REQUIRED venv-for-jsonnet-lint) + add_tox_test(jsonnet-check TOX_ENVS jsonnet-check) + set_property(TEST run-tox-jsonnet-check PROPERTY + FIXTURES_REQUIRED venv-for-jsonnet-check jsonnet-bundler) + + add_tox_test(alerts-check TOX_ENVS alerts-check) + add_tox_test(alerts-lint TOX_ENVS alerts-lint) + add_tox_test(promql-query-test TOX_ENVS promql-query-test) + endif() + + if(DEFINED PROMTOOL_EXECUTABLE) + set(promtool_executable_checked TRUE) + endif() + + find_program(PROMTOOL_EXECUTABLE promtool) + if(PROMTOOL_EXECUTABLE) + execute_process( + COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null + RESULT_VARIABLE rc + OUTPUT_QUIET) + if(NOT rc) + add_ceph_test(run-promtool-unittests + ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_SOURCE_DIR}/monitoring/ceph-mixin/tests_alerts/test_alerts.yml) + elseif(NOT promtool_executable_checked) + message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, " + "please use a newer prometheus") + endif() + elseif(NOT promtool_executable_checked) + message(WARNING "run-promtool-unittests is skipped due to missing promtool") + endif() + endif() +endif() diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/Makefile b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/Makefile new file mode 100644 index 000000000..915481d8c --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/Makefile @@ -0,0 +1,25 @@ +all: fmt generate lint test + +fmt: + ./lint-jsonnet.sh -i + +generate: dashboards_out + tox -ealerts-fix + +vendor: jsonnetfile.lock.json + tox -ejsonnet-bundler-install + +dashboards_out: vendor dashboards + tox -ejsonnet-fix + 
+lint: + tox -ejsonnet-lint + tox -ealerts-lint + +test: generate + tox -ejsonnet-check + tox -epromql-query-test + tox -ealerts-test +check: test + +.PHONY: all fmt generate lint test check diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/README.md b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/README.md new file mode 100644 index 000000000..f34d67f92 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/README.md @@ -0,0 +1,82 @@ +## Prometheus Monitoring Mixin for Ceph +A set of Grafana dashboards and Prometheus alerts for Ceph. + +All the Grafana dashboards are already generated in the `dashboards_out` +directory and alerts in the `prometheus_alerts.yml` file. + +You can use the Grafana dashboards and alerts with Jsonnet like any other +prometheus mixin. You can find more resources about mixins in general on +[monitoring.mixins.dev](https://monitoring.mixins.dev/). + +### Grafana dashboards for Ceph +In `dashboards_out` you can find a collection of +[Grafana](https://grafana.com/grafana) dashboards for Ceph Monitoring. + +These dashboards are based on metrics collected +from [prometheus](https://prometheus.io/) scraping the [prometheus mgr +plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the +[node_exporter (0.17.0)](https://github.com/prometheus/node_exporter). + + +##### Recommended versions: +-grafana 8.3.5 + -grafana-piechart-panel 1.6.2 + -grafana-status-panel 1.0.11 + +#### Requirements + +- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed on + your Grafana instance +- [Pie Chart Panel](https://grafana.com/grafana/plugins/grafana-piechart-panel/) + installed on your Grafana instance + + +### Prometheus alerts +In `prometheus_alerts.libsonnet` you'll find a set of Prometheus +alert rules that should provide a decent set of default alerts for a +Ceph cluster. After building them with jsonnet put this file in place according to your Prometheus +configuration (wherever the `rules` configuration stanza points). + +### Multi-cluster support +Ceph-mixin supports dashboards and alerts across multiple clusters. +To enable this feature you need to configure the following in `config.libsonnnet`: + +``` +showMultiCluster: true, +clusterLabel: '', +``` + +##### Recommended versions: +-prometheus v2.33.4 + +#### SNMP +Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending +Prometheus alerts to an SNMP management platform. The translation from +Prometheus alert to SNMP trap requires the Prometheus alert to contain an OID +that maps to a definition within the MIB. When making changes to the Prometheus +alert rules file, developers should include any necessary changes to the MIB. + + +##### Recommended: +-alertmanager 0.16.2 + +### Building from Jsonnet + +- Install [jsonnet](https://jsonnet.org/) (at least v0.18.0) + - By installing the package `jsonnet` in most of the distro and + `golang-github-google-jsonnet` in fedora +- Install [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler) + +To rebuild all the generated files, you can run `tox -egrafonnet-fix`. + +The jsonnet code located in this directory depends on some Jsonnet third party +libraries. To update those libraries you can run `jb update` and then update +the generated files using `tox -egrafonnet-fix`. 
+ +### Building alerts from `prometheus_alerts.libsonnet` + +To rebuild the `prometheus_alerts.yml` file from the corresponding libsonnet, +you can run `tox -ealerts-fix`. + + +##### Any upgrade or downgrade to different major versions of the recommended tools mentioned above is not supported. diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.jsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.jsonnet new file mode 100644 index 000000000..13e70179f --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts, indent_array_in_object=true, quote_keys=false) diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.libsonnet new file mode 100644 index 000000000..c2d39e2d3 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/alerts.libsonnet @@ -0,0 +1,4 @@ +{ + prometheusAlerts+:: (import 'prometheus_alerts.libsonnet') + + { _config:: $._config }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/config.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/config.libsonnet new file mode 100644 index 000000000..c0af859e4 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/config.libsonnet @@ -0,0 +1,23 @@ +{ + _config+:: { + dashboardTags: ['ceph-mixin'], + + clusterLabel: 'cluster', + showMultiCluster: true, + + CephNodeNetworkPacketDropsThreshold: 0.005, + CephNodeNetworkPacketDropsPerSec: 10, + CephRBDMirrorImageTransferBandwidthThreshold: 0.8, + CephRBDMirrorImagesPerDaemonThreshold: 100, + NVMeoFMaxGatewaysPerGroup: 4, + NVMeoFMaxGatewaysPerCluster: 4, + NVMeoFHighGatewayCPU: 80, + NVMeoFMaxSubsystemsPerGateway: 16, + NVMeoFHighClientCount: 32, + NVMeoFHighHostCPU: 80, + // + // Read/Write latency is defined in ms + NVMeoFHighClientReadLatency: 10, + NVMeoFHighClientWriteLatency: 20, + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.jsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.jsonnet new file mode 100644 index 000000000..9d913ed3f --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.libsonnet 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.libsonnet new file mode 100644 index 000000000..82e1888e0 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards.libsonnet @@ -0,0 +1,13 @@ +{ + grafanaDashboards+:: + (import 'dashboards/cephfs.libsonnet') + + (import 'dashboards/host.libsonnet') + + (import 'dashboards/osd.libsonnet') + + (import 'dashboards/pool.libsonnet') + + (import 'dashboards/rbd.libsonnet') + + (import 'dashboards/rgw.libsonnet') + + (import 'dashboards/ceph-cluster.libsonnet') + + (import 'dashboards/rgw-s3-analytics.libsonnet') + + (import 'dashboards/multi-cluster.libsonnet') + + { _config:: $._config }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/ceph-cluster.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/ceph-cluster.libsonnet new file mode 100644 index 000000000..a6991f54c --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/ceph-cluster.libsonnet @@ -0,0 +1,1683 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'ceph-cluster-advanced.json': $.dashboardSchema( + 'Ceph Cluster - Advanced', + 'Ceph cluster overview', + 'dn13KBeTv', + 'now-6h', + '1m', + 38, + $._config.dashboardTags, + '' + ).addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, // enable + true, // hide + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ).addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ).addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ).addRequired( + type='panel', id='heatmap', name='Heatmap', version='5.0.0' + ).addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ).addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ).addTemplate( + $.addClusterTemplate() + ).addTemplate( + $.addCustomTemplate( + name='interval', + query='5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d', + current='$__auto_interval_interval', + refresh=2, + label='Interval', + auto_count=10, + auto_min='1m', + options=[ + { selected: true, text: 'auto', value: '$__auto_interval_interval' }, + { selected: false, text: '5s', value: '5s' }, + { selected: false, text: '10s', value: '10s' }, + { selected: false, text: '30s', value: '30s' }, + { selected: false, text: '1m', value: '1m' }, + { selected: false, text: '10m', value: '10m' }, + { selected: false, text: '30m', value: '30m' }, + { selected: false, text: '1h', value: '1h' }, + { selected: false, text: '6h', value: '6h' }, + { selected: false, text: '12h', value: '12h' }, + { selected: false, text: '1d', value: '1d' }, + { selected: false, text: '7d', value: '7d' }, + { selected: false, text: '14d', value: '14d' }, + { selected: false, text: '30d', value: '30d' }, + ], + auto=true, + ) + ).addPanels( + [ + $.addRowSchema(collapse=false, showTitle=true, title='CLUSTER STATE') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + $.addStatPanel( + title='Ceph health status', + unit='none', + datasource='$datasource', + gridPosition={ x: 0, y: 1, w: 3, h: 3 }, + colorMode='value', + 
interval='1m', + transparent=true, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + pluginVersion='9.4.7' + ).addMappings([ + { + options: { + '0': { text: 'HEALTHY' }, + '1': { text: 'WARNING' }, + '2': { text: 'ERROR' }, + }, + type: 'value', + }, + { options: { match: null, result: { text: 'N/A' } }, type: 'special' }, + ]) + .addThresholds([ + { color: '#9ac48a' }, + { color: 'rgba(237, 129, 40, 0.89)', value: 1 }, + { color: 'rgba(245, 54, 54, 0.9)', value: 2 }, + ]) + .addTarget($.addTargetSchema( + expr='ceph_health_status{%(matchers)s}' % $.matchers(), + instant=true, + interval='$interval', + datasource='$datasource', + step=300, + )), + + $.addGaugePanel( + title='Available Capacity', + gridPosition={ h: 6, w: 3, x: 3, y: 1 }, + unit='percentunit', + max=1, + min=0, + interval='1m', + pluginVersion='9.4.7' + ).addMappings([ + { options: { match: null, result: { text: 'N/A' } }, type: 'special' }, + ]) + .addThresholds([ + { color: 'rgba(245, 54, 54, 0.9)' }, + { color: 'rgba(237, 129, 40, 0.89)', value: 0.1 }, + { color: 'rgba(50, 172, 45, 0.97)', value: 0.3 }, + ]) + .addTarget($.addTargetSchema( + expr='(ceph_cluster_total_bytes{%(matchers)s}-ceph_cluster_total_used_bytes{%(matchers)s})/ceph_cluster_total_bytes{%(matchers)s}' % $.matchers(), + instant=true, + interval='$interval', + datasource='$datasource', + step=300 + )), + + $.addStatPanel( + title='Cluster Capacity', + unit='decbytes', + datasource='$datasource', + gridPosition={ x: 6, y: 1, w: 3, h: 3 }, + graphMode='area', + decimals=2, + interval='1m', + color={ fixedColor: 'rgb(31, 120, 193)', mode: 'fixed' }, + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addMappings([ + { options: { match: null, result: { text: 'N/A' } }, type: 'special' }, + ]).addThresholds([ + { color: 'rgba(50, 172, 45, 0.97)' }, + { color: 'rgba(237, 129, 40, 0.89)', value: 0.025 }, + { color: 'rgba(245, 54, 54, 0.9)', value: 1.0 }, + ]) + .addTarget($.addTargetSchema( + expr='ceph_cluster_total_bytes{%(matchers)s}' % $.matchers(), + instant=true, + interval='$interval', + datasource='$datasource', + step=300 + )), + + $.addStatPanel( + title='Write Throughput', + unit='Bps', + datasource='$datasource', + gridPosition={ x: 9, y: 1, w: 3, h: 3 }, + decimals=1, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addMappings([ + { options: { match: null, result: { text: 'N/A' } }, type: 'special' }, + ]).addThresholds([ + { color: 'green' }, + ]) + .addTarget($.addTargetSchema( + expr='sum(irate(ceph_osd_op_w_in_bytes{%(matchers)s}[5m]))' % $.matchers(), + instant=true, + interval='$interval', + datasource='$datasource', + )), + + $.addStatPanel( + title='Read Throughput', + unit='Bps', + datasource='$datasource', + gridPosition={ x: 12, y: 1, w: 3, h: 3 }, + decimals=1, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addMappings([ + { options: { match: null, result: { text: 'N/A' } }, type: 'special' }, + ]).addThresholds([ + { color: '#d44a3a' }, + { color: 'rgba(237, 129, 40, 0.89)', value: 0 }, + { color: '#9ac48a', value: 0 }, + ]) + .addTarget($.addTargetSchema( + expr='sum(irate(ceph_osd_op_r_out_bytes{%(matchers)s}[5m]))' % $.matchers(), + instant=true, + interval='$interval', + datasource='$datasource', + )), + + $.addStatPanel( + title='OSDs', + datasource='$datasource', + gridPosition={ h: 3, w: 6, x: 15, y: 1 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='auto', + rootColorMode='Panel', + 
displayName='', + rootColors={ + crit: 'rgb(255, 0, 0)', + disable: 'rgba(128, 128, 128, 0.9)', + ok: 'rgba(50, 128, 45, 0.9)', + warn: 'rgba(237, 129, 40, 0.9)', + }, + cornerRadius=0, + flipCard=false, + flipTime=5, + isAutoScrollOnOverflow=false, + isGrayOnNoData=false, + isHideAlertsOnDisable=false, + isIgnoreOKColors=false, + fontFormat='Regular', + colorMode='background', + unit='none', + pluginVersion='9.4.7', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + aggregation='Last', + alias='All', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), + legendFormat='All', + interval='$interval', + datasource='$datasource', + ), + $.addTargetSchema( + aggregation='Last', + alias='In', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ceph_osd_in{%(matchers)s})' % $.matchers(), + legendFormat='In', + interval='$interval', + datasource='$datasource', + ), + $.addTargetSchema( + aggregation='Last', + alias='Out', + decimals=2, + displayAliasType='Warning / Critical', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='sum(ceph_osd_in{%(matchers)s} == bool 0)' % $.matchers(), + legendFormat='Out', + interval='', + warn=1, + datasource='$datasource', + ), + $.addTargetSchema( + aggregation='Last', + alias='Up', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='sum(ceph_osd_up{%(matchers)s})' % $.matchers(), + legendFormat='Up', + interval='', + datasource='$datasource', + ), + $.addTargetSchema( + aggregation='Last', + alias='Down', + decimals=2, + displayAliasType='Warning / Critical', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='sum(ceph_osd_up{%(matchers)s} == bool 0)' % $.matchers(), + legendFormat='Down', + interval='', + warn=1, + datasource='$datasource', + ), + ]), + + $.addStatPanel( + title='MGRs', + datasource='$datasource', + gridPosition={ h: 6, w: 3, x: 21, y: 1 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='auto', + rootColorMode='Panel', + displayName='', + rootColors={ + crit: 'rgba(245, 54, 54, 0.9)', + disable: 'rgba(128, 128, 128, 0.9)', + ok: 'rgba(50, 128, 45, 0.9)', + warn: 'rgba(237, 129, 40, 0.9)', + }, + cornerRadius=1, + flipCard=false, + flipTime=5, + isAutoScrollOnOverflow=false, + isGrayOnNoData=false, + isHideAlertsOnDisable=false, + isIgnoreOKColors=false, + fontFormat='Regular', + colorMode='background', + unit='none', + pluginVersion='9.4.7', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + aggregation='Last', + alias='Active', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ceph_mgr_status{%(matchers)s} == 1) or vector(0)' % $.matchers(), + legendFormat='Active', + datasource='$datasource', + instant=true, + ), + $.addTargetSchema( + 
aggregation='Last', + alias='Standby', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ceph_mgr_status{%(matchers)s} == 0) or vector(0)' % $.matchers(), + legendFormat='Standby', + datasource='$datasource', + instant=true, + ), + ]), + + $.addStatPanel( + title='Firing Alerts', + datasource='$datasource', + gridPosition={ h: 3, w: 3, x: 0, y: 4 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='auto', + rootColorMode='Panel', + displayName='', + rootColors={ + crit: 'rgba(245, 54, 54, 0.9)', + disable: 'rgba(128, 128, 128, 0.9)', + ok: 'rgba(50, 128, 45, 0.9)', + warn: 'rgba(237, 129, 40, 0.9)', + }, + cornerRadius=1, + flipCard=false, + flipTime=5, + isAutoScrollOnOverflow=false, + isGrayOnNoData=false, + isHideAlertsOnDisable=false, + isIgnoreOKColors=false, + fontFormat='Regular', + colorMode='background', + unit='none', + pluginVersion='9.4.7', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 1 }, + ]) + .addOverrides([ + { matcher: { id: 'byName', options: 'Critical' }, properties: [ + { id: 'color', value: { fixedColor: 'red', mode: 'fixed' } }, + ] }, + { matcher: { id: 'byName', options: 'Warning' }, properties: [ + { id: 'color', value: { fixedColor: '#987d24', mode: 'fixed' } }, + ] }, + ]) + .addTargets([ + $.addTargetSchema( + aggregation='Last', + alias='Active', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ALERTS{alertstate="firing",alertname=~"^Ceph.+", severity="critical", %(matchers)s}) OR vector(0)' % $.matchers(), + legendFormat='Critical', + datasource='$datasource', + instant=true, + ), + $.addTargetSchema( + aggregation='Last', + alias='Standby', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Number Threshold', + expr='count(ALERTS{alertstate="firing",alertname=~"^Ceph.+", severity="warning", %(matchers)s}) OR vector(0)' % $.matchers(), + legendFormat='Warning', + datasource='$datasource', + instant=true, + ), + ]), + + $.addStatPanel( + title='Used Capacity', + datasource='$datasource', + gridPosition={ h: 3, w: 3, x: 6, y: 4 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='horizontal', + graphMode='area', + displayName='', + maxDataPoints=100, + colorMode='none', + unit='decbytes', + pluginVersion='9.4.7', + ) + .addMappings([ + { options: { result: { text: 'N/A' } }, type: 'special' }, + ]) + .addThresholds([ + { color: 'rgba(50, 172, 45, 0.97)', value: null }, + { color: 'rgba(237, 129, 40, 0.89)', value: 0.025 }, + { color: 'rgba(245, 54, 54, 0.9)', value: 0.1 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='ceph_cluster_total_used_bytes{%(matchers)s}' % $.matchers(), + legendFormat='', + datasource='$datasource', + instant=true, + ), + ]), + + $.addStatPanel( + title='Write IOPS', + datasource='$datasource', + gridPosition={ h: 3, w: 3, x: 9, y: 4 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='horizontal', + graphMode='area', + displayName='', + maxDataPoints=100, + colorMode='none', + unit='ops', + pluginVersion='9.4.7', + ) + .addMappings([ + { options: { result: { text: 'N/A' } }, type: 'special' }, + ]) + .addThresholds([ + { color: 'green', value: null }, + ]) + 
.addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_w{%(matchers)s}[1m]))' % $.matchers(), + legendFormat='', + datasource='$datasource', + instant=true, + ), + ]), + + $.addStatPanel( + title='Read IOPS', + datasource='$datasource', + gridPosition={ h: 3, w: 3, x: 12, y: 4 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='horizontal', + graphMode='area', + displayName='', + maxDataPoints=100, + colorMode='none', + unit='ops', + pluginVersion='9.4.7', + ) + .addMappings([ + { options: { result: { text: 'N/A' } }, type: 'special' }, + ]) + .addThresholds([ + { color: '#d44a3a', value: null }, + { color: 'rgba(237, 129, 40, 0.89)', value: 0 }, + { color: '#9ac48a', value: 0 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_r{%(matchers)s}[1m]))' % $.matchers(), + legendFormat='', + datasource='$datasource', + instant=true, + ), + ]), + + $.addStatPanel( + title='Monitors', + datasource='$datasource', + gridPosition={ h: 3, w: 6, x: 15, y: 4 }, + color={ mode: 'thresholds' }, + thresholdsMode='absolute', + orientation='auto', + rootColorMode='Panel', + displayName='', + rootColors={ + crit: 'rgba(245, 54, 54, 0.9)', + disable: 'rgba(128, 128, 128, 0.9)', + ok: 'rgba(50, 128, 45, 0.9)', + warn: 'rgba(237, 129, 40, 0.9)', + }, + cornerRadius=1, + flipCard=false, + flipTime=5, + isAutoScrollOnOverflow=false, + isGrayOnNoData=false, + isHideAlertsOnDisable=false, + isIgnoreOKColors=false, + fontFormat='Regular', + colorMode='background', + unit='none', + pluginVersion='9.4.7', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + aggregation='Last', + alias='In Quorum', + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Text Only', + expr='sum(ceph_mon_quorum_status{%(matchers)s})' % $.matchers(), + legendFormat='In Quorum', + datasource='$datasource', + ), + $.addTargetSchema( + aggregation='Last', + alias='Total', + crit=1, + decimals=2, + displayAliasType='Always', + displayType='Regular', + displayValueWithAlias='When Alias Displayed', + units='none', + valueHandler='Text Only', + expr='count(ceph_mon_quorum_status{%(matchers)s})' % $.matchers(), + legendFormat='Total', + datasource='$datasource', + warn=2, + ), + $.addTargetSchema( + aggregation='Last', + alias='MONs out of Quorum', + crit=1.6, + decimals=2, + displayAliasType='Warning / Critical', + displayType='Annotation', + displayValueWithAlias='Never', + units='none', + valueHandler='Number Threshold', + expr='count(ceph_mon_quorum_status{%(matchers)s}) - sum(ceph_mon_quorum_status{%(matchers)s})' % $.matchers(), + legendFormat='MONs out of Quorum', + datasource='$datasource', + warn=1.1, + range=true, + ), + ]), + $.addRowSchema(collapse=false, showTitle=true, title='CLUSTER STATS') + { gridPos: { x: 0, y: 7, w: 24, h: 1 } }, + $.addAlertListPanel( + title='Alerts', + datasource={ + type: 'datasource', + uid: 'grafana', + }, + gridPosition={ h: 8, w: 8, x: 0, y: 8 }, + alertInstanceLabelFilter='{alertname=~"^Ceph.+", %(matchers)s}' % $.matchers(), + alertName='', + dashboardAlerts=false, + groupBy=[], + groupMode='default', + maxItems=20, + sortOrder=1, + stateFilter={ + 'error': true, + firing: true, + noData: false, + normal: false, + pending: true, + }, + ), + + $.timeSeriesPanel( + title='Capacity', + datasource='$datasource', + gridPosition={ h: 8, w: 8, x: 8, y: 8 }, + fillOpacity=40, + 
pointSize=5, + showPoints='never', + unit='bytes', + displayMode='table', + tooltip={ mode: 'multi', sort: 'desc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=2, + thresholdsMode='percentage', + sortBy='Last', + sortDesc=true, + ) + .addCalcs(['last']) + .addThresholds([ + { color: 'green', value: null }, + { color: '#c0921f', value: 75 }, + { color: '#E02F44', value: 85 }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byName', options: 'Total Capacity' }, + properties: [{ + id: 'color', + value: { fixedColor: 'red', mode: 'fixed' }, + }], + }, + { + matcher: { id: 'byName', options: 'Used' }, + properties: [ + { + id: 'color', + value: { fixedColor: 'green', mode: 'fixed' }, + }, + { + id: 'custom.thresholdsStyle', + value: { mode: 'dashed' }, + }, + ], + }, + ] + ) + .addTargets( + [ + $.addTargetSchema( + expr='ceph_cluster_total_bytes{%(matchers)s}' % $.matchers(), + datasource='$datasource', + interval='$interval', + instant=false, + legendFormat='Total Capacity', + step=300, + range=true, + ), + $.addTargetSchema( + expr='ceph_cluster_total_used_bytes{%(matchers)s}' % $.matchers(), + datasource='$datasource', + interval='$interval', + instant=false, + legendFormat='Used', + step=300, + range=true, + ), + ] + ), + + $.timeSeriesPanel( + title='Cluster Throughput', + datasource='$datasource', + gridPosition={ h: 8, w: 8, x: 16, y: 8 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='decbytes', + displayMode='table', + tooltip={ mode: 'multi', sort: 'desc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + ).addCalcs(['mean', 'lastNotNull', 'max', 'min']) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 85 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_w_in_bytes{%(matchers)s}[5m]))' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Write', + step=300, + range=true, + ), + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_r_out_bytes{%(matchers)s}[5m]))' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Read', + step=300, + range=true, + ), + ] + ), + + $.timeSeriesPanel( + title='IOPS', + datasource='$datasource', + gridPosition={ h: 8, w: 8, x: 0, y: 16 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='decbytes', + displayMode='table', + tooltip={ mode: 'multi', sort: 'desc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + ) + .addCalcs(['mean', 'lastNotNull', 'max', 'min']) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_w{%(matchers)s}[1m]))' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Write', + step=300, + range=true, + ), + $.addTargetSchema( + expr='sum(irate(ceph_osd_op_r{%(matchers)s}[1m]))' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Read', + step=300, + range=true, + ), + ] + ), + + $.timeSeriesPanel( + title='Pool Used Bytes', + datasource='$datasource', + gridPosition={ h: 8, w: 8, x: 8, y: 16 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='bytes', + tooltip={ mode: 'multi', sort: 'desc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + 
thresholdsMode='absolute', + displayMode='list', + placement='right', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='(ceph_pool_bytes_used{%(matchers)s}) *on (pool_id) group_left(name)(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='{{name}}', + step=300, + ), + ] + ), + + $.timeSeriesPanel( + title='Pool Used RAW Bytes', + datasource='$datasource', + gridPosition={ h: 8, w: 8, x: 16, y: 16 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='bytes', + tooltip={ mode: 'multi', sort: 'desc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='table', + placement='right', + ) + .addThresholds([ + { color: 'green', value: null }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byName', options: 'rbd Stored' }, + properties: [{ + id: 'color', + value: { fixedColor: 'transparent', mode: 'fixed' }, + }], + }, + ] + ) + .addTargets( + [ + $.addTargetSchema( + expr='(ceph_pool_stored_raw{%(matchers)s}) *on (pool_id) group_left(name)(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='', + legendFormat='{{name}}', + step=300, + range=true, + hide=false, + ), + ] + ), + + $.timeSeriesPanel( + title='Pool Objects Quota', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 0, y: 24 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='none', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='list', + placement='bottom', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='(ceph_pool_quota_objects{%(matchers)s}) *on (pool_id) group_left(name)(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='', + legendFormat='{{name}}', + step=300, + ), + ] + ), + + $.timeSeriesPanel( + title='Pool Quota Bytes', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 8, y: 24 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='bytes', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='none', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='list', + placement='bottom', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='(ceph_pool_quota_bytes{%(matchers)s}) *on (pool_id) group_left(name)(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='', + legendFormat='{{name}}', + step=300, + ), + ] + ), + + $.timeSeriesPanel( + title='Objects Per Pool', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 16, y: 24 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='normal', + spanNulls=false, + decimals=null, + thresholdsMode='absolute', + displayMode='list', + placement='right', + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='(ceph_pool_objects{%(matchers)s}) * on (pool_id) 
group_left(name)(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='', + legendFormat='{{name}}', + ), + ] + ), + + $.addRowSchema(collapse=false, showTitle=true, title='OBJECTS') + { gridPos: { x: 0, y: 31, w: 24, h: 1 } }, + + $.timeSeriesPanel( + title='OSD Type Count', + datasource='$datasource', + gridPosition={ h: 12, w: 6, x: 0, y: 32 }, + fillOpacity=10, + pointSize=5, + lineWidth=2, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'asc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='list', + placement='bottom', + showLegend=false, + ) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byRegexp', options: '/^Total.*$/' }, + properties: [{ + id: 'custom.stacking', + value: { group: false, mode: 'normal' }, + }], + }, + ] + ) + .addTargets( + [ + $.addTargetSchema( + expr='sum(ceph_pool_objects{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Total', + range=true, + step=200 + ), + ] + ), + + $.timeSeriesPanel( + title='PGs State', + datasource='$datasource', + gridPosition={ h: 12, w: 8, x: 6, y: 32 }, + fillOpacity=10, + pointSize=5, + lineWidth=2, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'asc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='table', + placement='right', + showLegend=true, + ) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addCalcs(['lastNotNull']) + .addOverrides( + [ + { + matcher: { id: 'byRegexp', options: '/^Total.*$/' }, + properties: [{ + id: 'custom.stacking', + value: { group: false, mode: 'normal' }, + }], + }, + ] + ) + .addTargets( + [ + $.addTargetSchema( + expr='sum(ceph_pg_active{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Active', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_clean{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Clean', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_peering{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Peering', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_degraded{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Degraded', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_pg_stale{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Stale', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_unclean_pgs{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Unclean', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_pg_undersized{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Undersized', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_pg_incomplete{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Incomplete', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_forced_backfill{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Forced 
Backfill', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_forced_recovery{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Forced Recovery', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_creating{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Creating', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_wait_backfill{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Wait Backfill', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_deep{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Deep', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_scrubbing{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Scrubbing', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_recovering{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Recovering', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_repair{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Repair', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_down{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Down', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_peered{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Peered', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_backfill{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Backfill', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_remapped{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Remapped', + range=true, + ), + $.addTargetSchema( + expr='sum(ceph_pg_backfill_toofull{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Backfill Toofull', + range=true, + ), + ] + ), + + $.timeSeriesPanel( + title='Stuck PGs', + datasource='$datasource', + gridPosition={ h: 6, w: 10, x: 14, y: 32 }, + fillOpacity=10, + pointSize=5, + lineWidth=2, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'asc' }, + interval='$interval', + stackingMode='normal', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='table', + placement='right', + showLegend=true, + ) + .addCalcs(['mean', 'lastNotNull']) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byRegexp', options: '/^Total.*$/' }, + properties: [{ + id: 'custom.stacking', + value: { group: false, mode: 'normal' }, + }], + }, + ] + ) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_pg_degraded{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Degraded', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_pg_stale{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Stale', + range=true, + step=300, + ), + $.addTargetSchema( + expr='sum(ceph_pg_undersized{%(matchers)s})' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='Undersized', + range=true, + step=300, + ), + ]), + + 
$.timeSeriesPanel( + title='Recovery Operations', + datasource='$datasource', + gridPosition={ h: 6, w: 10, x: 14, y: 38 }, + fillOpacity=10, + pointSize=5, + lineWidth=2, + showPoints='never', + unit='short', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='none', + spanNulls=true, + decimals=null, + thresholdsMode='absolute', + displayMode='list', + placement='bottom', + showLegend=false, + ) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_osd_recovery_ops{%(matchers)s}[$interval]))' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='OPS', + step=300, + ), + ]), + $.addRowSchema(false, true, 'LATENCY', collapsed=true) + .addPanels([ + $.heatMapPanel( + title='OSD Apply Latency Distribution', + datasource='$datasource', + gridPosition={ h: 8, w: 12, x: 0, y: 42 }, + colorMode='opacity', + legendShow=true, + optionsCalculate=true, + optionsColor={ + exponent: 0.5, + fill: '#b4ff00', + mode: 'opacity', + reverse: false, + scale: 'exponential', + scheme: 'Oranges', + steps: 128, + }, + optionsExemplars={ color: 'rgba(255,0,255,0.7)' }, + optionsFilterValues={ le: 1e-9 }, + optionsLegend={ show: true }, + optionsRowFrame={ layout: 'auto' }, + optionsToolTip={ + show: true, + yHistogram: false, + }, + optionsYAxis={ + axisPlacement: 'left', + min: '0', + reverse: false, + unit: 'ms', + }, + xBucketSize='', + yAxisFormat='ms', + yAxisLogBase=2, + yAxisMin='0', + yBucketSize=10, + pluginVersion='9.4.7', + ).addTarget($.addTargetSchema( + expr='ceph_osd_apply_latency_ms{%(matchers)s}' % $.matchers(), + datasource='$datasource', + interval='$interval', + instant=false, + )), + $.heatMapPanel( + title='OSD Commit Latency Distribution', + datasource='$datasource', + gridPosition={ h: 8, w: 12, x: 12, y: 42 }, + colorMode='opacity', + legendShow=true, + cardColor='#65c5db', + optionsColor={ + exponent: 0.5, + fill: '#65c5db', + mode: 'opacity', + reverse: false, + scale: 'exponential', + scheme: 'Oranges', + steps: 128, + }, + optionsCalculate=true, + optionsCalculation={ + yBuckets: { + mode: 'count', + scale: { log: 2, type: 'log' }, + }, + }, + optionsExemplars={ color: 'rgba(255,0,255,0.7)' }, + optionsFilterValues={ le: 1e-9 }, + optionsLegend={ show: true }, + optionsRowFrame={ layout: 'auto' }, + optionsToolTip={ + show: true, + yHistogram: false, + }, + optionsYAxis={ + axisPlacement: 'left', + min: '0', + reverse: false, + unit: 'ms', + }, + xBucketSize='', + yAxisFormat='ms', + yAxisLogBase=2, + yAxisMin='0', + yBucketSize=10, + pluginVersion='9.4.7', + ).addTarget($.addTargetSchema( + expr='ceph_osd_commit_latency_ms{%(matchers)s}' % $.matchers(), + datasource='$datasource', + interval='$interval', + instant=false, + )), + $.heatMapPanel( + title='OSD Read Op Latency Distribution', + datasource='$datasource', + gridPosition={ h: 8, w: 12, x: 0, y: 50 }, + colorMode='opacity', + legendShow=true, + cardColor='#806eb7', + optionsColor={ + exponent: 0.5, + fill: '#806eb7', + mode: 'opacity', + reverse: false, + scale: 'exponential', + scheme: 'Oranges', + steps: 128, + }, + optionsCalculate=true, + optionsCalculation={ + yBuckets: { + mode: 'count', + scale: { log: 2, type: 'log' }, + }, + }, + optionsExemplars={ color: 'rgba(255,0,255,0.7)' }, + optionsFilterValues={ le: 1e-9 }, + optionsLegend={ show: true }, + optionsRowFrame={ layout: 'auto' }, + optionsToolTip={ + show: true, + yHistogram: false, + }, + optionsYAxis={ + axisPlacement: 
'left', + decimals: 2, + min: '0', + reverse: false, + unit: 'ms', + }, + xBucketSize='', + yAxisFormat='ms', + yAxisLogBase=2, + yAxisMin='0', + yBucketSize=null, + pluginVersion='9.4.7', + ).addTarget($.addTargetSchema( + expr='rate(ceph_osd_op_r_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[5m]) >= 0' % $.matchers(), + datasource='$datasource', + interval='$interval', + instant=false, + )), + + $.heatMapPanel( + title='OSD Write Op Latency Distribution', + datasource='$datasource', + gridPosition={ h: 8, w: 12, x: 12, y: 50 }, + colorMode='opacity', + legendShow=true, + cardColor='#f9934e', + optionsColor={ + exponent: 0.5, + fill: '#f9934e', + mode: 'opacity', + reverse: false, + scale: 'exponential', + scheme: 'Oranges', + steps: 128, + }, + optionsCalculate=true, + optionsCalculation={ + yBuckets: { + mode: 'count', + scale: { log: 2, type: 'log' }, + }, + }, + optionsExemplars={ color: 'rgba(255,0,255,0.7)' }, + optionsFilterValues={ le: 1e-9 }, + optionsLegend={ show: true }, + optionsRowFrame={ layout: 'auto' }, + optionsToolTip={ + show: true, + yHistogram: false, + }, + optionsYAxis={ + axisPlacement: 'left', + decimals: 2, + min: '0', + reverse: false, + unit: 'ms', + }, + xBucketSize='', + yAxisFormat='ms', + yAxisLogBase=2, + yAxisMin='0', + yBucketSize=null, + pluginVersion='9.4.7', + ).addTarget($.addTargetSchema( + expr='rate(ceph_osd_op_w_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[5m]) >= 0' % $.matchers(), + datasource='$datasource', + interval='$interval', + legendFormat='', + instant=false, + )), + $.timeSeriesPanel( + title='Recovery Operations', + datasource='$datasource', + gridPosition={ h: 7, w: 12, x: 0, y: 58 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='ms', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='none', + spanNulls=false, + decimals=null, + thresholdsMode='absolute', + displayMode='table', + placement='bottom', + showLegend=true, + ) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='avg(rate(ceph_osd_op_r_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_r_latency_count{%(matchers)s}[5m]) >= 0)' % $.matchers(), + datasource='$datasource', + legendFormat='Read', + ), + $.addTargetSchema( + expr='avg(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[5m]) / rate(ceph_osd_op_w_latency_count{%(matchers)s}[5m]) >= 0)' % $.matchers(), + datasource='$datasource', + legendFormat='Write', + ), + ]), + + $.timeSeriesPanel( + title='AVG OSD Apply + Commit Latency', + datasource='$datasource', + gridPosition={ h: 7, w: 12, x: 12, y: 58 }, + fillOpacity=10, + pointSize=5, + lineWidth=1, + showPoints='never', + unit='ms', + tooltip={ mode: 'multi', sort: 'none' }, + interval='$interval', + stackingMode='none', + spanNulls=false, + decimals=null, + thresholdsMode='absolute', + displayMode='table', + placement='bottom', + showLegend=true, + ) + .addCalcs(['lastNotNull', 'max']) + .addThresholds([ + { color: 'green' }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='avg(ceph_osd_apply_latency_ms{%(matchers)s})' % $.matchers(), + datasource='$datasource', + legendFormat='apply', + interval='$interval', + metric='ceph_osd_perf_apply_latency_seconds', + step=4, + ), + $.addTargetSchema( + expr='avg(ceph_osd_commit_latency_ms{%(matchers)s})' % $.matchers(), + datasource='$datasource', + legendFormat='commit', + interval='$interval', + 
metric='ceph_osd_perf_commit_latency_seconds', + step=4, + ), + ]), + ]) + + { gridPos: { x: 0, y: 44, w: 24, h: 1 } }, + $.addRowSchema(collapse=true, showTitle=true, title='', collapsed=false) + { gridPos: { x: 0, y: 45, w: 24, h: 1 } }, + + $.addTableExtended( + datasource='$datasource', + title='Ceph Versions', + gridPosition={ h: 6, w: 24, x: 0, y: 46 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'left', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green' }, + ], + }, + overrides=[{ + matcher: { id: 'byName', options: 'Time' }, + properties: [ + { id: 'custom.hidden', value: true }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'organize', + options: { + excludeByName: {}, + indexByName: {}, + renameByName: { + Time: '', + 'Value #A': 'OSD Services', + 'Value #B': 'Mon Services', + 'Value #C': 'MDS Services', + 'Value #D': 'RGW Services', + 'Value #E': 'MGR Services', + ceph_version: 'Ceph Version', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='count by (ceph_version)(ceph_osd_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='OSD Services', + range=false, + ), + $.addTargetSchema( + expr='count by (ceph_version)(ceph_mon_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Mon Services', + range=false, + ), + $.addTargetSchema( + expr='count by (ceph_version)(ceph_mds_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + format='table', + hide=false, + exemplar=false, + instant=true, + legendFormat='MDS Services', + range=false, + ), + $.addTargetSchema( + expr='count by (ceph_version)(ceph_rgw_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='RGW Services', + range=false, + ), + $.addTargetSchema( + expr='count by (ceph_version)(ceph_mgr_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='MGR Services', + range=false, + ), + ]), + + + ] //end panels + ), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/cephfs.libsonnet new file mode 100644 index 000000000..11548ef2a --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/cephfs.libsonnet @@ -0,0 +1,86 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'cephfs-overview.json': + $.dashboardSchema( + 'MDS Performance', + '', + 'tbO9LAiZz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', 
id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('mds_servers', + '$datasource', + 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + 'MDS Server', + '') + ) + .addPanels([ + $.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + $.simpleGraphPanel( + {}, + 'MDS Workload - $mds_servers', + '', + 'none', + 'Reads(-) / Writes (+)', + 0, + 'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*", %(matchers)s}[$__rate_interval]))' % $.matchers(), + 'Read Ops', + 0, + 1, + 12, + 9 + ) + .addTarget($.addTargetSchema( + 'sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*", %(matchers)s}[$__rate_interval]))' % $.matchers(), + 'Write Ops' + )) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + $.simpleGraphPanel( + {}, + 'Client Request Load - $mds_servers', + '', + 'none', + 'Client Requests', + 0, + 'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*", %(matchers)s}' % $.matchers(), + '{{ceph_daemon}}', + 12, + 1, + 12, + 9 + ), + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/host.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/host.libsonnet new file mode 100644 index 000000000..cf7e04569 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/host.libsonnet @@ -0,0 +1,793 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'hosts-overview.json': + $.dashboardSchema( + 'Host Overview', + '', + 'y0KGL0iZz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags, + '', + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('osd_hosts', + '$datasource', + 'label_values(ceph_osd_metadata{%(matchers)s}, hostname)' % $.matchers(), + 1, + true, + 1, + null, + '([^.]*).*') + ) + .addTemplate( + $.addTemplateSchema('mon_hosts', + '$datasource', + 'label_values(ceph_mon_metadata{%(matchers)s}, hostname)' % $.matchers(), + 1, + true, + 1, + null, + 'mon.(.*)') + ) + .addTemplate( + $.addTemplateSchema('mds_hosts', + '$datasource', + 'label_values(ceph_mds_inodes{hostname, %(matchers)s})' % $.matchers(), + 1, + true, + 1, + null, + 'mds.(.*)') + ) + .addTemplate( + $.addTemplateSchema('rgw_hosts', + '$datasource', + 'label_values(ceph_rgw_metadata{hostname, %(matchers)s})' % $.matchers(), + 1, + true, + 1, + null, + 'rgw.(.*)') + ) + .addPanels([ + $.simpleSingleStatPanel( + 'none', + 'OSD Hosts', + '', + 'current', + 'count(sum by 
(hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(), + true, + 'time_series', + 0, + 0, + 4, + 5 + ), + $.simpleSingleStatPanel( + 'percentunit', + 'AVG CPU Busy', + 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', + 'current', + ||| + avg(1 - ( + avg by(instance) ( + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) + ) + )) + |||, + true, + 'time_series', + 4, + 0, + 4, + 5 + ), + $.simpleSingleStatPanel( + 'percentunit', + 'AVG RAM Utilization', + 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', + 'current', + ||| + avg (( + ( + node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) - (( + node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + + ( + node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ( + node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ( + node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ) + ) / ( + node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} + )) + |||, + true, + 'time_series', + 8, + 0, + 4, + 5 + ), + $.simpleSingleStatPanel( + 'none', + 'Physical IOPS', + 'IOPS Load at the device as reported by the OS on all OSD hosts', + 'current', + ||| + sum (( + rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) + ) + ( + rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) + )) + |||, + true, + 'time_series', + 12, + 0, + 4, + 5 + ), + $.simpleSingleStatPanel( + 'percent', + 'AVG Disk Utilization', + 'Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)', + 'current', + ||| + avg ( + label_replace( + (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or + (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100), + "instance", "$1", "instance", "([^.:]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace( + ceph_disk_occupation_human{instance=~"($osd_hosts).*", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^.:]*).*" + ) + ) + ||| % $.matchers(), + true, + 'time_series', + 16, + 0, + 4, + 5 + ), + $.simpleSingleStatPanel( + 'bytes', + 'Network Load', + 'Total send/receive network load across all hosts in the ceph cluster', + 'current', + ||| + sum ( + ( + rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((node_bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + + sum ( + ( + rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((node_bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + |||, + true, + 'time_series', + 20, + 0, + 4, + 5 + ), + $.simpleGraphPanel( + {}, + 'CPU Busy - Top 10 Hosts', + 'Show the top 10 busiest hosts by cpu', + 'percent', + null, + 0, + ||| + topk(10, + 100 * ( + 1 - ( + avg by(instance) ( + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) + ) + ) + ) + ) + |||, + '{{instance}}', + 0, + 5, + 12, + 9 + ), + $.simpleGraphPanel( + {}, + 'Network Load - Top 10 Hosts', + 'Top 10 hosts by network load', + 'Bps', + null, + 0, + ||| + topk(10, (sum by(instance) ( + ( + rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) + + ( + rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((node_bonding_slaves > 0), "device", "$1", "master", "(.+)")) + )) + |||, + '{{instance}}', + 12, + 5, + 12, + 9 + ), + ]), + 'host-details.json': + $.dashboardSchema( + 'Host Details', + '', + 'rtOg0AiWz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '' + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') 
+ ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('ceph_hosts', + '$datasource', + 'label_values({__name__=~"ceph_.+_metadata", %(matchers)s}, hostname)' % $.matchers(), + 1, + true, + 1, + null, + '([^.]*).*') + ) + .addPanels([ + $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + $.simpleSingleStatPanel( + 'none', + 'OSDs', + '', + 'current', + 'count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(), + null, + 'time_series', + 0, + 1, + 3, + 5 + ), + $.simpleGraphPanel( + { + interrupt: '#447EBC', + steal: '#6D1F62', + system: '#890F02', + user: '#3F6833', + wait: '#C15C17', + }, + 'CPU Utilization', + "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", + 'percent', + '% Utilization', + null, + ||| + sum by (mode) ( + rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) + ) / ( + scalar( + sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) + ) * 100 + ) + |||, + '{{mode}}', + 3, + 1, + 6, + 10 + ), + $.simpleGraphPanel( + { + Available: '#508642', + Free: '#508642', + Total: '#bf1b00', + Used: '#bf1b00', + total: '#bf1b00', + used: '#0a50a1', + }, + 'RAM Usage', + '', + 'bytes', + 'RAM used', + null, + ||| + node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + |||, + 'Free', + 9, + 1, + 6, + 10 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + |||, + 'total' + ), + $.addTargetSchema( + ||| + ( + node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + |||, + 'buffers/cache' + ), + $.addTargetSchema( + ||| + ( + node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) - ( + ( + node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + + ( + node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ) + |||, + 'used' + ), + ] + ) + .addSeriesOverride( + { + alias: 'total', + color: '#bf1b00', + fill: 0, + linewidth: 2, + stack: false, + } + ), + $.simpleGraphPanel( + {}, + 'Network Load', + "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + 'decbytes', + 'Send (-) / Receive (+)', + null, + ||| + sum by (device) ( + rate( + 
node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval] + ) + ) + |||, + '{{device}}.rx', + 15, + 1, + 6, + 10 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum by (device) ( + rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) + ) + |||, + '{{device}}.tx' + ), + ] + ) + .addSeriesOverride( + { alias: '/.*tx/', transform: 'negative-Y' } + ), + $.simpleGraphPanel( + {}, + 'Network drop rate', + '', + 'pps', + 'Send (-) / Receive (+)', + null, + ||| + rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.rx', + 21, + 1, + 3, + 5 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.tx' + ), + ] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + $.simpleSingleStatPanel( + 'bytes', + 'Raw Capacity', + 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', + 'current', + ||| + sum( + ceph_osd_stat_bytes{%(matchers)s} and + on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?", %(matchers)s} + ) + ||| % $.matchers(), + null, + 'time_series', + 0, + 6, + 3, + 5 + ), + $.simpleGraphPanel( + {}, + 'Network error rate', + '', + 'pps', + 'Send (-) / Receive (+)', + null, + ||| + rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.rx', + 21, + 6, + 3, + 5 + ) + .addTargets( + [$.addTargetSchema( + ||| + rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.tx' + )] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + $.addRowSchema(false, + true, + 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, + $.simpleGraphPanel( + {}, + '$ceph_hosts Disk IOPS', + "For any OSD devices on the host, this chart shows the iops per physical device. 
Each device is shown by it's name and corresponding OSD id value", + 'ops', + 'Read (-) / Write (+)', + null, + ||| + label_replace( + ( + rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace( + ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) writes', + 0, + 12, + 11, + 9 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + label_replace( + ( + rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace( + ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) reads' + ), + ] + ) + .addSeriesOverride( + { alias: '/.*reads/', transform: 'negative-Y' } + ), + $.simpleGraphPanel( + {}, + '$ceph_hosts Throughput by Disk', + 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id', + 'Bps', + 'Read (-) / Write (+)', + null, + ||| + label_replace( + ( + rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) + group_left(ceph_daemon) label_replace( + label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), + "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) write', + 12, + 12, + 11, + 9 + ) + .addTargets( + [$.addTargetSchema( + ||| + label_replace( + ( + rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), + "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) + group_left(ceph_daemon) label_replace( + label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), + "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) read' + )] + ) + .addSeriesOverride( + { alias: '/.*read/', transform: 'negative-Y' } + ), + $.simpleGraphPanel( + {}, + '$ceph_hosts Disk Latency', + "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with its corresponding OSD id", + 's', + '', + null, + ||| + max by(instance, device) (label_replace( + (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or + (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001), + "instance", "$1", "instance", "([^:.]*).*" + )) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace( + ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}})', + 0, + 21, + 11, + 9 + ), + $.simpleGraphPanel( + {}, + '$ceph_hosts Disk utilization', + 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', + 'percent', + '%Util', + null, + ||| + label_replace( + ( + (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or + rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100 + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}})', + 12, + 21, + 11, + 9 + ), + + $.addTableExtended( + datasource='${datasource}', + title='Top Slow Ops per Host', + gridPosition={ h: 8, w: 6, x: 0, y: 30 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'instance' }, + properties: [ + { id: 'displayName', value: 'Instance' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Slow Ops' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + cluster: true, + }, + indexByName: {}, + renameByName: {}, + includeByName: {}, + }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*", %(matchers)s})) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/multi-cluster.libsonnet 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/multi-cluster.libsonnet new file mode 100644 index 000000000..8185ebb25 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/multi-cluster.libsonnet @@ -0,0 +1,931 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'multi-cluster-overview.json': + $.dashboardSchema( + 'Ceph - Multi-cluster', + '', + 'BnxelG7Sx', + 'now-1h', + '30s', + 22, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + + .addTemplate( + $.addTemplateSchema( + 'cluster', + '$datasource', + 'label_values(ceph_health_status, %s)' % $._config.clusterLabel, + 1, + true, + 1, + 'cluster', + '(.*)', + if !$._config.showMultiCluster then 'variable' else '', + multi=true, + allValues='.*', + ), + ) + + .addPanels([ + $.addRowSchema(false, true, 'Clusters') + { gridPos: { x: 0, y: 1, w: 24, h: 1 } }, + $.addStatPanel( + title='Status', + datasource='$datasource', + gridPosition={ x: 0, y: 2, w: 5, h: 7 }, + graphMode='none', + colorMode='value', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'text', value: null }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byName', options: 'Warning' }, + properties: [ + { + id: 'thresholds', + value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-yellow', value: 1 }] }, + }, + ], + }, + { + matcher: { id: 'byName', options: 'Error' }, + properties: [ + { + id: 'thresholds', + value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-red', value: 1 }] }, + }, + ], + }, + { + matcher: { id: 'byName', options: 'Healthy' }, + properties: [ + { + id: 'thresholds', + value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-green', value: 1 }] }, + }, + ], + }, + ] + ) + .addTargets([ + $.addTargetSchema( + expr='count(ceph_health_status==0) or vector(0)', + datasource='$datasource', + legendFormat='Healthy', + ), + $.addTargetSchema( + expr='count(ceph_health_status==1)', + datasource='$datasource', + legendFormat='Warning' + ), + $.addTargetSchema( + expr='count(ceph_health_status==2)', + datasource='$datasource', + legendFormat='Error' + ), + ]), + + $.addTableExtended( + datasource='$datasource', + title='Details', + gridPosition={ h: 7, w: 19, x: 5, y: 2 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'left', cellOptions: { type: 'color-text' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'text' }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'Value #A' }, + properties: [ + { id: 'mappings', value: [{ options: { '0': { color: 'semi-dark-green', index: 2, text: 'Healthy' }, '1': { color: 'semi-dark-yellow', index: 0, text: 'Warning' }, '2': { color: 'semi-dark-red', index: 1, text: 'Error' } }, type: 'value' }] }, + ], + }, + { + matcher: { id: 'byName', options: 'Capacity Used' }, + properties: [ + { id: 
'unit', value: 'bytes' }, + ], + }, + { + matcher: { id: 'byName', options: 'Cluster' }, + properties: [ + { id: 'links', value: [{ title: '', url: '/d/edtb0oxdq/ceph-cluster?var-cluster=${__data.fields.Cluster}&${DS_PROMETHEUS:queryparam}' }] }, + ], + }, + { + matcher: { id: 'byName', options: 'Alerts' }, + properties: [ + { id: 'mappings', value: [{ options: { match: null, result: { index: 0, text: '0' } }, type: 'special' }] }, + ], + }, + ], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'joinByField', + options: { byField: 'cluster', mode: 'outer' }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Value #B': true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + ceph_daemon: true, + device_class: true, + hostname: true, + 'instance 1': true, + 'instance 2': true, + 'instance 3': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'replica 1': true, + 'replica 2': true, + 'replica 3': true, + }, + indexByName: { + 'Time 1': 8, + 'Time 2': 13, + 'Time 3': 21, + 'Time 4': 7, + 'Time 5': 22, + 'Time 6': 23, + 'Value #A': 1, + 'Value #B': 20, + 'Value #C': 3, + 'Value #D': 6, + '__name__ 1': 9, + '__name__ 2': 14, + '__name__ 3': 24, + ceph_daemon: 15, + ceph_version: 2, + cluster: 0, + device_class: 25, + hostname: 16, + 'instance 1': 10, + 'instance 2': 17, + 'instance 3': 26, + 'job 1': 11, + 'job 2': 18, + 'job 3': 27, + 'replica 1': 12, + 'replica 2': 19, + 'replica 3': 28, + }, + renameByName: { + 'Value #A': 'Status', + 'Value #C': 'Alerts', + 'Value #D': 'Capacity Used', + ceph_version: 'Version', + cluster: 'Cluster', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='ceph_health_status', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_mgr_metadata', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='count(ALERTS{alertstate="firing", cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_cluster_by_class_total_used_bytes', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + + + $.addRowSchema(false, true, 'Overview') + { gridPos: { x: 0, y: 9, w: 24, h: 1 } }, + $.addStatPanel( + title='Cluster Count', + datasource='$datasource', + gridPosition={ x: 0, y: 10, w: 3, h: 4 }, + graphMode='none', + colorMode='value', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'text', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='count(ceph_health_status{cluster=~"$cluster"}) or vector(0)', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + + $.addGaugePanel( + title='Capacity Used', + gridPosition={ h: 8, w: 4, x: 
3, y: 10 }, + unit='percentunit', + max=1, + min=0, + interval='1m', + pluginVersion='9.4.7' + ) + .addThresholds([ + { color: 'green', value: null }, + { color: 'semi-dark-yellow', value: 0.75 }, + { color: 'red', value: 0.85 }, + ]) + .addTarget($.addTargetSchema( + expr='sum(ceph_cluster_total_used_bytes{cluster=~"$cluster"}) / sum(ceph_cluster_total_bytes{cluster=~"$cluster"})', + instant=true, + legendFormat='Used', + datasource='$datasource', + )), + + $.addStatPanel( + title='Total Capacity', + datasource='$datasource', + gridPosition={ x: 7, y: 10, w: 3, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + unit='bytes', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_cluster_total_bytes{cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=false, + interval='', + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='OSDs', + datasource='$datasource', + gridPosition={ x: 10, y: 10, w: 3, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + unit='none', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='count(ceph_osd_metadata{cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=false, + interval='', + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='Hosts', + datasource='$datasource', + gridPosition={ x: 13, y: 10, w: 3, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + unit='none', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='count(sum by (hostname) (ceph_osd_metadata{cluster=~"$cluster"}))', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=false, + interval='', + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='Client IOPS', + datasource='$datasource', + gridPosition={ x: 16, y: 10, w: 4, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + unit='ops', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_pool_wr{cluster=~"$cluster"}[$__interval]))', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + instant=false, + legendFormat='Write', + range=true, + ), + $.addTargetSchema( + expr='sum(irate(ceph_pool_rd{cluster=~"$cluster"}[$__interval]))', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + legendFormat='Read', + range=true, + ), + ]), + + $.addStatPanel( + title='OSD Latencies', + datasource='$datasource', + gridPosition={ x: 20, y: 10, w: 4, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + unit='ms', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='avg(ceph_osd_apply_latency_ms{cluster=~"$cluster"})', + 
datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + instant=false, + legendFormat='Apply', + range=true, + ), + $.addTargetSchema( + expr='avg(ceph_osd_commit_latency_ms{cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + legendFormat='Commit', + range=true, + ), + ]), + + $.addStatPanel( + title='Alert Count', + datasource='$datasource', + gridPosition={ x: 0, y: 14, w: 3, h: 4 }, + graphMode='none', + colorMode='value', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'text', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='count(ALERTS{alertstate="firing", cluster=~"$cluster"}) or vector(0)', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + + $.addStatPanel( + title='Total Used', + datasource='$datasource', + gridPosition={ x: 7, y: 14, w: 3, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + unit='bytes', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_cluster_total_used_bytes{cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=false, + interval='', + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='Capacity Prediction', + datasource='$datasource', + gridPosition={ x: 10, y: 14, w: 3, h: 4 }, + graphMode='none', + colorMode='none', + orientation='auto', + justifyMode='auto', + unit='s', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='predict_linear(avg(increase(ceph_cluster_total_used_bytes{cluster=~"$cluster"}[1d]))[7d:1h],120)', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='Pools', + datasource='$datasource', + gridPosition={ x: 13, y: 14, w: 3, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + unit='none', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='count(ceph_pool_metadata{cluster=~"$cluster"})', + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=false, + interval='', + legendFormat='__auto', + range=true, + ), + ]), + + $.addStatPanel( + title='Client Bandwidth', + datasource='$datasource', + gridPosition={ x: 16, y: 14, w: 4, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + unit='binBps', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_pool_rd_bytes{cluster=~"$cluster"}[$__interval]))', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + instant=false, + legendFormat='Read', + range=true, + ), + $.addTargetSchema( + 
expr='sum(irate(ceph_pool_wr_bytes{cluster=~"$cluster"}[$__interval]))', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + legendFormat='Write', + range=true, + ), + ]), + + $.addStatPanel( + title='Recovery Rate', + datasource='$datasource', + gridPosition={ x: 20, y: 14, w: 4, h: 4 }, + graphMode='area', + colorMode='none', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + unit='binBps', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(irate(ceph_osd_recovery_ops{cluster=~"$cluster"}[$__interval]))', + datasource={ type: 'prometheus', uid: '$datasource' }, + hide=false, + exemplar=false, + instant=false, + legendFormat='Write', + range=true, + ), + ]), + + + $.addRowSchema(false, true, 'Alerts', collapsed=true) + .addPanels([ + $.addStatPanel( + title='Status', + datasource='$datasource', + gridPosition={ x: 0, y: 19, w: 5, h: 7 }, + graphMode='area', + colorMode='value', + orientation='auto', + justifyMode='center', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'text', value: null }, + ]) + .addOverrides( + [ + { + matcher: { id: 'byName', options: 'Critical' }, + properties: [ + { + id: 'thresholds', + value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-red', value: 1 }] }, + }, + ], + }, + { + matcher: { id: 'byName', options: 'Warning' }, + properties: [ + { + id: 'thresholds', + value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-yellow', value: 1 }] }, + }, + ], + }, + ] + ) + .addTargets([ + $.addTargetSchema( + expr='count(ALERTS{alertstate="firing",severity="critical", cluster=~"$cluster"}) OR vector(0)', + datasource='$datasource', + legendFormat='Critical', + instant=true, + range=false + ), + $.addTargetSchema( + expr='count(ALERTS{alertstate="firing",severity="warning", cluster=~"$cluster"}) OR vector(0)', + datasource='$datasource', + legendFormat='Warning', + instant=true, + range=false + ), + ]), + + + $.addTableExtended( + datasource='$datasource', + title='Alerts', + gridPosition={ h: 7, w: 19, x: 5, y: 19 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [{ desc: false, displayName: 'Severity' }], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green' }, + { color: 'red', value: 80 }, + ], + }, + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'joinByField', + options: { byField: 'cluster', mode: 'outer' }, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + Value: true, + __name__: true, + instance: true, + job: true, + oid: true, + replica: true, + type: true, + }, + indexByName: { + Time: 0, + Value: 9, + __name__: 1, + alertname: 2, + alertstate: 4, + cluster: 3, + instance: 6, + job: 7, + severity: 5, + type: 8, + }, + renameByName: { + alertname: 'Name', + alertstate: 'State', + cluster: 'Cluster', + severity: 'Severity', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='ALERTS{alertstate="firing", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '$datasource' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + + 
$.addAlertListPanel( + title='Alerts(Grouped)', + datasource={ + type: 'datasource', + uid: 'grafana', + }, + gridPosition={ h: 8, w: 24, x: 0, y: 26 }, + alertName='', + dashboardAlerts=false, + groupBy=[], + groupMode='default', + maxItems=20, + sortOrder=1, + stateFilter={ + 'error': true, + firing: true, + noData: false, + normal: false, + pending: true, + }, + ), + ]) + { gridPos: { x: 0, y: 18, w: 24, h: 1 } }, + + $.addRowSchema(false, true, 'Cluster Stats', collapsed=true) + .addPanels([ + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Top 5 - Capacity Utilization(%)', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 0, y: 30 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='percentunit', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'multi', sort: 'desc' }, + stackingMode='none', + spanNulls=false, + decimals=2, + thresholdsMode='percentage', + sortBy='Last', + sortDesc=true + ) + .addCalcs(['last']) + .addThresholds([ + { color: 'green' }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='topk(5, ceph_cluster_total_used_bytes/ceph_cluster_total_bytes)', + datasource='$datasource', + instant=false, + legendFormat='{{cluster}}', + step=300, + range=true, + ), + ] + ), + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Top 5 - Cluster IOPS', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 8, y: 30 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='ops', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'multi', sort: 'desc' }, + stackingMode='none', + spanNulls=false, + decimals=2, + thresholdsMode='percentage', + sortBy='Last', + sortDesc=true + ) + .addCalcs(['last']) + .addThresholds([ + { color: 'green' }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='topk(10, sum by (cluster) (irate(ceph_osd_op_w[$__interval])) \n+ sum by (cluster) (irate(ceph_osd_op_r[$__interval])) )', + datasource='$datasource', + instant=false, + legendFormat='{{cluster}}', + step=300, + range=true, + ), + ] + ), + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Top 10 - Capacity Utilization(%) by Pool', + datasource='$datasource', + gridPosition={ h: 7, w: 8, x: 16, y: 30 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='percentunit', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'multi', sort: 'desc' }, + stackingMode='none', + spanNulls=false, + decimals=2, + thresholdsMode='absolute', + sortBy='Last', + sortDesc=true + ) + .addCalcs(['last']) + .addThresholds([ + { color: 'green' }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='topk(10, ceph_pool_bytes_used{%(matchers)s}/ceph_pool_max_avail{%(matchers)s} * on(pool_id, cluster) group_left(instance, name) ceph_pool_metadata{%(matchers)s})' % $.matchers(), + datasource='$datasource', + instant=false, + legendFormat='{{cluster}} - {{name}}', + step=300, + range=true, + ), + ] + ), + ]) + { gridPos: { x: 0, y: 29, w: 24, h: 1 } }, + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet new file mode 100644 
index 000000000..2b066ea5f --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -0,0 +1,771 @@ +local g = import 'grafonnet/grafana.libsonnet'; + + +(import 'utils.libsonnet') { + 'osds-overview.json': + $.dashboardSchema( + 'OSD Overview', + '', + 'lo02I1Aiz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addPanels([ + $.simpleGraphPanel( + { '@95%ile': '#e0752d' }, + 'OSD Read Latencies', + '', + 'ms', + null, + '0', + ||| + avg ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 + ) + ||| % $.matchers(), + 'AVG read', + 0, + 0, + 8, + 8 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + max( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 + ) + ||| % $.matchers(), + 'MAX read' + ), + $.addTargetSchema( + ||| + quantile(0.95, + ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) + * 1000 + ) + ) + ||| % $.matchers(), + '@95%ile' + ), + ], + ), + + $.addTableExtended( + datasource='${datasource}', + title='Highest READ Latencies', + gridPosition={ h: 8, w: 4, x: 8, y: 0 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'ceph_daemon' }, + properties: [ + { id: 'displayName', value: 'OSD ID' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Latency (ms)' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + cluster: true, + }, + indexByName: {}, + renameByName: {}, + includeByName: {}, + }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + (sort( + ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * + 1000 + ) + )) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + + $.simpleGraphPanel( + { 
+ '@95%ile write': '#e0752d', + }, + 'OSD Write Latencies', + '', + 'ms', + null, + '0', + ||| + avg( + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) + * 1000 + ) + ||| % $.matchers(), + 'AVG write', + 12, + 0, + 8, + 8 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + max( + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000 + ) + ||| % $.matchers(), 'MAX write' + ), + $.addTargetSchema( + ||| + quantile(0.95, ( + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000 + )) + ||| % $.matchers(), '@95%ile write' + ), + ], + ), + + $.addTableExtended( + datasource='${datasource}', + title='Highest WRITE Latencies', + description="This table shows the OSDs that are delivering the 10 highest write latencies within the cluster", + gridPosition={ h: 8, w: 4, x: 20, y: 0 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'ceph_daemon' }, + properties: [ + { id: 'displayName', value: 'OSD ID' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Latency (ms)' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'mappings', value: [{ type: 'value', options: { NaN: { text: '0.00', index: 0 } } }] }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + cluster: true, + }, + indexByName: {}, + renameByName: {}, + includeByName: {}, + }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + (sort( + (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000) + )) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + + $.pieChartPanel('OSD Types Summary', '', '$datasource', { x: 0, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic') + .addTarget( + $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}') + ), + $.pieChartPanel('OSD Objectstore Types', '', '$datasource', { x: 4, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic') + .addTarget($.addTargetSchema( + 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 
'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2 + )), + $.pieChartPanel('OSD Size Summary', 'The pie chart shows the various OSD sizes used within the cluster', '$datasource', { x: 8, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic') + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2 + )), + g.graphPanel.new(bars=true, + datasource='$datasource', + title='Distribution of PGs per OSD', + x_axis_buckets=20, + x_axis_mode='histogram', + x_axis_values=['total'], + formatY1='short', + formatY2='short', + labelY1='# of OSDs', + min='0', + nullPointMode='null') + .addTarget($.addTargetSchema( + 'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true + )) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'short', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 12, y: 8, w: 8, h: 8 } }, + $.gaugeSingleStatPanel( + 'percentunit', + 'OSD onode Hits Ratio', + 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster', + 'current', + true, + 1, + true, + false, + '.75', + ||| + sum(ceph_bluestore_onode_hits{%(matchers)s}) / ( + sum(ceph_bluestore_onode_hits{%(matchers)s}) + + sum(ceph_bluestore_onode_misses{%(matchers)s}) + ) + ||| % $.matchers(), + 'time_series', + 20, + 8, + 4, + 8 + ), + $.addRowSchema(false, + true, + 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } }, + $.simpleGraphPanel( + {}, + 'Read/Write Profile', + 'Show the read/write workload profile over time', + 'short', + null, + null, + 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Reads', + 0, + 17, + 24, + 8 + ) + .addTargets([$.addTargetSchema( + 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes' + )]), + + $.addTableExtended( + datasource='${datasource}', + title='Top Slow Ops', + description='This table shows the 10 OSDs with the highest number of slow ops', + gridPosition={ h: 8, w: 5, x: 0, y: 25 }, + options={ + footer: { + fields: 
'', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'ceph_daemon' }, + properties: [ + { id: 'displayName', value: 'OSD ID' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Slow Ops' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + __name__: true, + instance: true, + job: true, + type: true, + cluster: true, + }, + indexByName: {}, + renameByName: {}, + includeByName: {}, + }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + ]), + 'osd-device-details.json': + local OsdDeviceDetailsPanel(title, + description, + formatY1, + labelY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema(expr1, + legendFormat1), + $.addTargetSchema(expr2, legendFormat2), + ] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'OSD device details', + '', + 'CrAHE0iZz', + 'now-3h', + '30s', + 16, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('osd', + '$datasource', + 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + false, + 1, + 'OSD', + '(.*)') + ) + .addPanels([ + $.addRowSchema( + false, true, 'OSD Performance' + ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + '$osd Latency', + '', + 's', + 'Read (-) / Write (+)', + ||| + rate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + ||| + rate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'read', + 'write', + 0, + 1, + 6, + 9 + ) + .addSeriesOverride( + { + alias: 'read', + transform: 'negative-Y', + } + ), + OsdDeviceDetailsPanel( + '$osd R/W IOPS', + '', + 'short', + 'Read (-) / Write (+)', + 
'rate(ceph_osd_op_r{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(), + 'rate(ceph_osd_op_w{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(), + 'Reads', + 'Writes', + 6, + 1, + 6, + 9 + ) + .addSeriesOverride( + { alias: 'Reads', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + '$osd R/W Bytes', + '', + 'bytes', + 'Read (-) / Write (+)', + 'rate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(), + 'rate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(), + 'Read Bytes', + 'Write Bytes', + 12, + 1, + 6, + 9 + ) + .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }), + $.addRowSchema( + false, true, 'Physical Device Performance' + ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + 'Physical Device Latency for $osd', + '', + 's', + 'Read (-) / Write (+)', + ||| + ( + label_replace( + rate(node_disk_read_time_seconds_total[$__rate_interval]) / + rate(node_disk_reads_completed_total[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ) + ||| % $.matchers(), + ||| + ( + label_replace( + rate(node_disk_write_time_seconds_total[$__rate_interval]) / + rate(node_disk_writes_completed_total[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) + label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ) + ||| % $.matchers(), + '{{instance}}/{{device}} Reads', + '{{instance}}/{{device}} Writes', + 0, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W IOPS for $osd', + '', + 'short', + 'Read (-) / Write (+)', + ||| + label_replace( + rate(node_disk_writes_completed_total[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + ||| + label_replace( + rate(node_disk_reads_completed_total[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}} on {{instance}} Writes', + '{{device}} on {{instance}} Reads', + 6, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W Bytes for $osd', + '', + 'Bps', + 'Read (-) / Write (+)', + ||| + label_replace( + rate(node_disk_read_bytes_total[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + ||| + label_replace( + rate(node_disk_written_bytes_total[$__rate_interval]), 
"instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{instance}} {{device}} Reads', + '{{instance}} {{device}} Writes', + 12, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + $.graphPanelSchema( + {}, + 'Physical Device Util% for $osd', + '', + 'null', + false, + 'percentunit', + 'short', + null, + null, + null, + 1, + '$datasource' + ) + .addTarget($.addTargetSchema( + ||| + label_replace( + rate(node_disk_io_time_seconds_total[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}} on {{instance}}' + )) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'percentunit', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/piechart_panel.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/piechart_panel.libsonnet new file mode 100644 index 000000000..68ff71954 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/piechart_panel.libsonnet @@ -0,0 +1,73 @@ +{ + /** + * Creates a pie chart panel. + * + * @name pieChartPanel.new + * + * @param title The title of the pie chart panel. + * @param description (default `''`) Description of the panel + * @param datasource (optional) Datasource + * @param pieType (default `'pie'`) Type of pie chart (one of pie or donut) + * + * @method addTarget(target) Adds a target object. + */ + new( + title, + description='', + datasource=null, + gridPos={}, + displayMode='table', + placement='bottom', + showLegend=true, + displayLabels=[], + tooltip={}, + pieType='pie', + values=[], + colorMode='auto', + overrides=[], + reduceOptions={}, + ):: { + type: 'piechart', + [if description != null then 'description']: description, + title: title, + gridPos: gridPos, + datasource: datasource, + options: { + legend: { + calcs: [], + values: values, + displayMode: displayMode, + placement: placement, + showLegend: showLegend, + }, + pieType: pieType, + tooltip: tooltip, + displayLabels: displayLabels, + reduceOptions: reduceOptions, + }, + fieldConfig: { + defaults: { + color: { mode: colorMode }, + mappings: [], + custom: { + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + }, + }, + overrides: overrides, + }, + targets: [ + ], + _nextTarget:: 0, + addTarget(target):: self { + // automatically ref id in added targets. 
+ local nextTarget = super._nextTarget, + _nextTarget: nextTarget + 1, + targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }], + }, + addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self), + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/pool.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/pool.libsonnet new file mode 100644 index 000000000..068321140 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/pool.libsonnet @@ -0,0 +1,775 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'pool-overview.json': + $.dashboardSchema( + 'Ceph Pools Overview', + '', + 'z99hzWtmk', + 'now-1h', + '30s', + 22, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + g.template.custom(label='TopK', + name='topk', + current='15', + query='15') + ) + .addPanels([ + $.simpleSingleStatPanel( + 'none', + 'Pools', + '', + 'avg', + 'count(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + true, + 'table', + 0, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'none', + 'Pools with Compression', + 'Count of the pools that have compression enabled', + 'current', + 'count(ceph_pool_metadata{compression_mode!="none", %(matchers)s})' % $.matchers(), + null, + '', + 3, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'bytes', + 'Total Raw Capacity', + 'Total raw capacity available to the cluster', + 'current', + 'sum(ceph_osd_stat_bytes{%(matchers)s})' % $.matchers(), + null, + '', + 6, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'bytes', + 'Raw Capacity Consumed', + 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', + 'current', + 'sum(ceph_pool_bytes_used{%(matchers)s})' % $.matchers(), + true, + '', + 9, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'bytes', + 'Logical Stored ', + 'Total of client data stored in the cluster', + 'current', + 'sum(ceph_pool_stored{%(matchers)s})' % $.matchers(), + true, + '', + 12, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'bytes', + 'Compression Savings', + 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', + 'current', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} - + ceph_pool_compress_bytes_used{%(matchers)s} + ) + ||| % $.matchers(), + null, + '', + 15, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'percent', + 'Compression Eligibility', + 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data', + 'current', + ||| + ( + sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0) + ) * 100 + ||| % $.matchers(), + null, + 'table', + 18, + 0, + 3, + 3 + ), + $.simpleSingleStatPanel( + 'none', + 'Compression Factor', + 'This factor describes the average ratio of data eligible 
to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)', + 'current', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} > 0) + / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) + ||| % $.matchers(), + null, + '', + 21, + 0, + 3, + 3 + ), + + $.addTableExtended( + datasource='${datasource}', + title='Pool Overview', + gridPosition={ h: 6, w: 24, x: 0, y: 3 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'Time' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'instance' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'job' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'name' }, + properties: [ + { id: 'displayName', value: 'Pool Name' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'pool_id' }, + properties: [ + { id: 'displayName', value: 'Pool ID' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #A' }, + properties: [ + { id: 'displayName', value: 'Compression Factor' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #D' }, + properties: [ + { id: 'displayName', value: '% Used' }, + { id: 'unit', value: 'percentunit' }, + { id: 'decimals', value: 2 }, + { id: 'custom.cellOptions', value: { type: 'color-text' } }, + { + id: 'thresholds', + value: { + mode: 'absolute', + steps: [ + { + color: 'rgba(245, 54, 54, 0.9)', + value: null, + }, + { + color: 'rgba(237, 129, 40, 0.89)', + value: 70, + }, + { + color: 'rgba(50, 172, 45, 0.97)', + value: 85, + }, + ], + }, + }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #B' }, + properties: [ + { id: 'displayName', value: 'Usable Free' }, + { id: 'unit', value: 'bytes' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #C' }, + properties: [ + { id: 'displayName', value: 'Compression Eligibility' }, + { id: 'unit', value: 'percent' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #E' }, + properties: [ + { id: 'displayName', value: 'Compression Savings' }, + { id: 'unit', value: 'bytes' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #F' }, + properties: [ + { id: 'displayName', value: 'Growth (5d)' }, + { id: 'unit', value: 'bytes' }, + { id: 'decimals', value: 2 }, + { id: 'custom.cellOptions', value: { type: 'color-text' } }, + { + id: 'thresholds', + value: { + mode: 'absolute', + steps: [ + { + color: 'rgba(245, 54, 54, 0.9)', + value: null, + }, + { + color: 'rgba(237, 129, 40, 0.89)', + value: 70, + }, + { + color: 'rgba(50, 172, 45, 0.97)', + value: 85, + }, + ], + }, + }, + ], + }, + { + matcher: { id: 'byName', 
options: 'Value #G' }, + properties: [ + { id: 'displayName', value: 'IOPS' }, + { id: 'unit', value: 'none' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #H' }, + properties: [ + { id: 'displayName', value: 'Bandwidth' }, + { id: 'unit', value: 'Bps' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: '__name__' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'type' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'compression_mode' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'description' }, + properties: [ + { id: 'displayName', value: 'Type' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #J' }, + properties: [ + { id: 'displayName', value: 'Stored' }, + { id: 'unit', value: 'bytes' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #I' }, + properties: [ + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + { + matcher: { id: 'byName', options: 'Value #K' }, + properties: [ + { id: 'displayName', value: 'Compression' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'seriesToRows', + options: {}, + }, + { + id: 'organize', + options: { + excludeByName: { + Time: true, + 'Value #A': true, + instance: true, + job: true, + pool_id: true, + 'Value #B': false, + 'Value #C': true, + __name__: true, + compression_mode: true, + type: true, + 'Value #I': true, + 'Value #K': true, + 'Value #D': false, + 'Value #E': true, + cluster: true, + }, + indexByName: {}, + renameByName: {}, + includeByName: {}, + }, + }, + ]).addTargets( + [ + $.addTargetSchema( + ||| + ( + ceph_pool_compress_under_bytes{%(matchers)s} / + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) and on(pool_id) ( + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + ceph_pool_stored_raw{%(matchers)s} + ) * 100 > 0.5 + ) + ||| % $.matchers(), + 'A', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_max_avail{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % $.matchers(), + 'B', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + ceph_pool_stored_raw{%(matchers)s} + ) * 100 + ||| % $.matchers(), + 'C', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_percent_used{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % $.matchers(), + 'D', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_compress_under_bytes{%(matchers)s} - + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ||| % $.matchers(), + 'E', + 'table', + 1, + true + ), + $.addTargetSchema( + 'delta(ceph_pool_stored{%(matchers)s}[5d])' % $.matchers(), 'F', 'table', 1, true + ), + $.addTargetSchema( + ||| + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'G', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + 
rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'H', + 'table', + 1, + true + ), + $.addTargetSchema( + 'ceph_pool_metadata{%(matchers)s}' % $.matchers(), 'I', 'table', 1, true + ), + $.addTargetSchema( + 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % $.matchers(), + 'J', + 'table', + 1, + true + ), + $.addTargetSchema( + 'ceph_pool_metadata{compression_mode!="none", %(matchers)s}' % $.matchers(), 'K', 'table', 1, true + ), + $.addTargetSchema('', 'L', '', '', null), + ] + ), + + $.simpleGraphPanel( + {}, + 'Top $topk Client IOPS by Pool', + 'This chart shows the sum of read and write IOPS from all clients by pool', + 'short', + 'IOPS', + 0, + ||| + topk($topk, + round( + ( + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ), 1 + ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}) + ||| % $.matchers(), + '{{name}} ', + 0, + 9, + 12, + 8 + ) + .addTarget( + $.addTargetSchema( + ||| + topk($topk, + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s} + ) + ||| % $.matchers(), + '{{name}} - write' + ) + ), + $.simpleGraphPanel( + {}, + 'Top $topk Client Bandwidth by Pool', + 'The chart shows the sum of read and write bytes from all clients, by pool', + 'Bps', + 'Throughput', + 0, + ||| + topk($topk, + ( + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s} + ) + ||| % $.matchers(), + '{{name}}', + 12, + 9, + 12, + 8 + ), + $.simpleGraphPanel( + {}, + 'Pool Capacity Usage (RAW)', + 'Historical view of capacity usage, to help identify growth and trends in pool consumption', + 'bytes', + 'Capacity Used', + 0, + 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % $.matchers(), + '{{name}}', + 0, + 17, + 24, + 7 + ), + ]), + 'pool-detail.json': + $.dashboardSchema( + 'Ceph Pool Details', + '', + '-xyV8KCiz', + 'now-1h', + '30s', + 22, + $._config.dashboardTags, + '' + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('pool_name', + '$datasource', + 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % $.matchers(), + 1, + false, + 1, + 'Pool Name', + '') + ) + .addPanels([ + $.gaugeSingleStatPanel( + 'percentunit', + 'Capacity used', + '', + 'current', + true, + 1, + true, + true, + '.7,.8', + ||| + (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'time_series', + 0, + 0, + 7, + 7 + ), + $.gaugeSingleStatPanel( + 's', + 'Time till full', + 'Time till pool is full assuming the average fill rate of the last 6 hours', + false, + 100, + false, + false, + '', + 'current', + 
||| + (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} > 0 + ||| % $.matchers(), + 'time_series', + 7, + 0, + 5, + 7 + ), + $.simpleGraphPanel( + { + read_op_per_sec: + '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Object Ingress/Egress', + '', + 'ops', + 'Objects out(-) / in(+) ', + null, + ||| + deriv(ceph_pool_objects{%(matchers)s}[1m]) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'Objects per second', + 12, + 0, + 12, + 7 + ), + $.simpleGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Client IOPS', + '', + 'iops', + 'Read (-) / Write (+)', + null, + ||| + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) * + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'reads', + 0, + 7, + 12, + 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + $.addTargetSchema( + ||| + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'writes' + ) + ), + $.simpleGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Client Throughput', + '', + 'Bps', + 'Read (-) / Write (+)', + null, + ||| + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'reads', + 12, + 7, + 12, + 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + $.addTargetSchema( + ||| + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'writes' + ) + ), + $.simpleGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Objects', + '', + 'short', + 'Objects', + null, + ||| + ceph_pool_objects{%(matchers)s} * + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name", %(matchers)s} + ||| % $.matchers(), + 'Number of Objects', + 0, + 14, + 12, + 7 + ), + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet new file mode 100644 index 000000000..bcb8a28cf --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -0,0 +1,492 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +local info_rbd_stats = std.join( + '', + [ + 'RBD per-image IO statistics are disabled by default.\n\n', + 'Please refer to ', + 'https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics ', + 'for information about how to enable those optionally.', + ] +); + +(import 'utils.libsonnet') { + 'rbd-details.json': + local RbdDetailsPanel(title, description, formatY1, expr1, expr2, x, y, w, h) = + $.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + formatY1, + null, + null, + 0, + 1, + '$datasource') + .addTargets( 
+ [ + $.addTargetSchema(expr1, + '{{pool}} Write'), + $.addTargetSchema(expr2, '{{pool}} Read'), + ] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'RBD Details', + 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)', + 'YhCYGcuZz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('pool', + '$datasource', + 'label_values(ceph_rbd_read_ops{%(matchers)s}, pool)' % $.matchers(), + 1, + false, + 0, + '', + '') + ) + + .addTemplate( + $.addTemplateSchema('image', + '$datasource', + 'label_values(ceph_rbd_read_ops{%(matchers)s, pool="$pool"}, image)' % $.matchers(), + 1, + false, + 0, + '', + '') + ) + .addPanels([ + RbdDetailsPanel( + 'IOPS', + info_rbd_stats, + 'iops', + 'rate(ceph_rbd_write_ops{pool="$pool", image="$image", %(matchers)s}[$__rate_interval])' % $.matchers() + , + 'rate(ceph_rbd_read_ops{pool="$pool", image="$image", %(matchers)s}[$__rate_interval])' % $.matchers(), + 0, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Throughput', + info_rbd_stats, + 'Bps', + 'rate(ceph_rbd_write_bytes{pool="$pool", image="$image", %(matchers)s}[$__rate_interval])' % $.matchers(), + 'rate(ceph_rbd_read_bytes{pool="$pool", image="$image", %(matchers)s}[$__rate_interval])' % $.matchers(), + 8, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Average Latency', + info_rbd_stats, + 'ns', + ||| + rate(ceph_rbd_write_latency_sum{pool="$pool", image="$image", %(matchers)s}[$__rate_interval]) / + rate(ceph_rbd_write_latency_count{pool="$pool", image="$image", %(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + ||| + rate(ceph_rbd_read_latency_sum{pool="$pool", image="$image", %(matchers)s}[$__rate_interval]) / + rate(ceph_rbd_read_latency_count{pool="$pool", image="$image", %(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 16, + 0, + 8, + 9 + ), + ]), + 'rbd-overview.json': + local RbdOverviewPanel(title, + description, + formatY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema(expr1, + legendFormat1), + $.addTargetSchema(expr2, + legendFormat2), + ] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'RBD Overview', + '', + '41FrpeUiz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.4.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='datasource', id='prometheus', 
name='Prometheus', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addPanels([ + RbdOverviewPanel( + 'IOPS', + info_rbd_stats, + 'short', + 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Writes', + 'Reads', + 0, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Throughput', + info_rbd_stats, + 'Bps', + 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Write', + 'Read', + 8, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Average Latency', + info_rbd_stats, + 'ns', + ||| + round( + sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval])) + ) + ||| % $.matchers(), + ||| + round( + sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval])) + ) + ||| % $.matchers(), + 'Write', + 'Read', + 16, + 0, + 8, + 7 + ), + + $.addTableExtended( + datasource='${datasource}', + title='Highest IOPS', + description='RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.', + gridPosition={ h: 7, w: 8, x: 0, y: 7 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'pool' }, + properties: [ + { id: 'displayName', value: 'Pool' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'image' }, + properties: [ + { id: 'displayName', value: 'Image' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'IOPS' }, + { id: 'unit', value: 'iops' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + ( + sort(( + rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) + + on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval]) + )) + ) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + + $.addTableExtended( + datasource='${datasource}', + title='Highest Throughput', + description='RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.', + gridPosition={ h: 7, w: 8, x: 8, y: 7 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + 
enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'pool' }, + properties: [ + { id: 'displayName', value: 'Pool' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'image' }, + properties: [ + { id: 'displayName', value: 'Image' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Throughput' }, + { id: 'unit', value: 'Bps' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + sort( + sum( + rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval]) + ) by (pool, image, namespace) + ) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + + $.addTableExtended( + datasource='${datasource}', + title='Highest Latency', + description='RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.', + gridPosition={ h: 7, w: 8, x: 16, y: 7 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + }, + custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ], + }, + overrides=[ + { + matcher: { id: 'byName', options: 'pool' }, + properties: [ + { id: 'displayName', value: 'Pool' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'image' }, + properties: [ + { id: 'displayName', value: 'Image' }, + { id: 'unit', value: 'short' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + { + matcher: { id: 'byName', options: 'Value' }, + properties: [ + { id: 'displayName', value: 'Latency' }, + { id: 'unit', value: 'ns' }, + { id: 'decimals', value: 2 }, + { id: 'custom.align', value: null }, + ], + }, + ], + pluginVersion='10.4.0' + ) + .addTransformations([ + { + id: 'merge', + options: { reducers: [] }, + }, + ]).addTarget( + $.addTargetSchema( + ||| + topk(10, + sum( + rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) + + rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1) + ) by (pool, image, namespace) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true + ) + ), + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw-s3-analytics.libsonnet 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw-s3-analytics.libsonnet new file mode 100644 index 000000000..720ffcb60 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw-s3-analytics.libsonnet @@ -0,0 +1,2450 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'rgw-s3-analytics.json': + $.dashboardSchema( + 'RGW S3 Analytics', + '', + 'BnxelG7Sz', + 'now-1h', + '30s', + 22, + $._config.dashboardTags, + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + + .addTemplate( + $.addClusterTemplate() + ) + + .addTemplate( + $.addTemplateSchema('rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 2, + true, + 0, + null, + '') + ) + + .addTemplate( + g.template.adhoc('Filters', '$datasource', 'filters', 0) + ) + + + .addPanels([ + $.addRowSchema(false, true, 'Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + $.addStatPanel( + title='Total PUTs', + datasource='${datasource}', + gridPosition={ x: 0, y: 1, w: 6, h: 3 }, + graphMode='none', + colorMode='none', + unit='decbytes', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='__auto', + range=true + ), + ]), + + $.addStatPanel( + title='Total GETs', + datasource='${datasource}', + gridPosition={ x: 6, y: 1, w: 6, h: 3 }, + graphMode='none', + colorMode='none', + unit='decbytes', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum\n(ceph_rgw_op_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='__auto', + range=true + ), + ]), + + $.addStatPanel( + title='Total Objects', + datasource='${datasource}', + gridPosition={ x: 12, y: 1, w: 6, h: 3 }, + graphMode='none', + colorMode='none', + unit='none', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='__auto', + range=true + ), + ]), + + $.addStatPanel( + title='Average Object Size', + datasource='${datasource}', + gridPosition={ x: 18, y: 1, w: 6, h: 3 }, + graphMode='none', + colorMode='none', + unit='decbytes', + orientation='auto', + justifyMode='auto', + thresholdsMode='absolute', + pluginVersion='9.4.7', + ).addThresholds([ 
+ { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets([ + $.addTargetSchema( + expr='sum\n((sum by(instance_id)(ceph_rgw_op_put_obj_bytes) > 0) / (sum by(instance_id)(ceph_rgw_op_put_obj_ops) > 0) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='__auto', + range=true + ), + ]), + + $.addBarGaugePanel( + title='Total Operations', + datasource='${datasource}', + gridPosition={ x: 0, y: 4, w: 8, h: 8 }, + unit='none', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_rgw_op_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='List Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_list_buckets_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='List Buckets', + range=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Put Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Get Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Delete Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_del_bucket_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Delete Buckets', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Copy Objects', + range=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green', value: null }] } } } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Total Size', + datasource='${datasource}', + gridPosition={ x: 8, y: 4, w: 8, h: 8 }, + unit='none', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Put Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + 
legendFormat='Get Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Delete Objects', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Copy Objects', + range=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green', value: null }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'unit', value: 'decbytes' }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + $.addBarGaugePanel( + title='Total Latencies', + datasource='${datasource}', + gridPosition={ x: 16, y: 4, w: 8, h: 8 }, + unit='none', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='sum(ceph_rgw_op_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='List Object', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_list_buckets_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='List Bucket', + range=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Put Object', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Get Object', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Delete Object', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_del_bucket_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Delete Bucket', + range=false, + instant=true + ), + $.addTargetSchema( + expr='sum(ceph_rgw_op_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + legendFormat='Copy Object', + range=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green', value: null }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'unit', value: 'ms' }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addTableExtended( + datasource='${datasource}', 
+ title='Summary Per Bucket by Bandwidth', + gridPosition={ h: 8, w: 12, x: 0, y: 12 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'color-text' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'decbytes' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'groupBy', + options: { + fields: { + Bucket: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + aggregations: [], + operation: 'groupby', + }, + bucket: { + aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + '__name__ 4': true, + '__name__ 5': true, + '__name__ 6': true, + '__name__ 7': true, + 'ceph_daemon 1': false, + 'ceph_daemon 2': true, + 'ceph_daemon 3': true, + 'ceph_daemon 4': true, + 'instance 1': true, + 'instance 2': true, + 'instance 3': true, + 'instance 4': true, + 'instance 5': true, + 'instance 6': true, + 'instance 7': true, + 'instance_id 1': true, + 'instance_id 2': true, + 'instance_id 3': true, + 'instance_id 4': true, + 'instance_id 5': true, + 'instance_id 6': true, + 'instance_id 7': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'job 4': true, + 'job 5': true, + 'job 6': true, + 'job 7': true, + }, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #D': 4, + 'Value #F': 5, + bucket: 1, + ceph_daemon: 0, + }, + renameByName: { + Bucket: '', + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'List', + 'Value #D': 'Delete', + 'Value #E': 'Copy', + 'Value #F': 'Copy', + 'Value #G': '', + bucket: 'Bucket', + ceph_daemon: 'Daemon', + 'ceph_daemon 1': 'Daemon', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Upload Objects', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Get Objects', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", 
%(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Delete Objects', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Copy Objects', + range=false, + ), + ]), + + + $.addTableExtended( + datasource='${datasource}', + title='Latency(ms) Per Bucket', + gridPosition={ h: 8, w: 12, x: 12, y: 12 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'ms' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'joinByField', + options: { + byField: 'Bucket', + mode: 'outer', + }, + }, + { + id: 'groupBy', + options: { + fields: { + Bucket: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #C': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + aggregations: [], + operation: 'groupby', + }, + bucket: { + aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + '__name__ 4': true, + '__name__ 5': true, + '__name__ 6': true, + '__name__ 7': true, + 'ceph_daemon 1': true, + 'ceph_daemon 2': true, + 'ceph_daemon 3': true, + 'ceph_daemon 4': true, + 'ceph_daemon 5': true, + 'instance 1': true, + 'instance 2': true, + 'instance 3': true, + 'instance 4': true, + 'instance 5': true, + 'instance 6': true, + 'instance 7': true, + 'instance_id 1': true, + 'instance_id 2': true, + 'instance_id 3': true, + 'instance_id 4': true, + 'instance_id 5': true, + 'instance_id 6': true, + 'instance_id 7': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'job 4': true, + 'job 5': true, + 'job 6': true, + 'job 7': true, + }, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #C': 4, + 'Value #D': 5, + 'Value #F': 6, + bucket: 1, + ceph_daemon: 0, + }, + renameByName: { + Bucket: '', + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'List', + 'Value #D': 'Delete', + 'Value #E': 'Copy', + 'Value #F': 'Copy', + 'Value #G': '', + bucket: 'Bucket', + ceph_daemon: 'Daemon', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='ceph_rgw_op_per_bucket_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", 
%(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='List Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_bucket_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Upload Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_bucket_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Get Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_bucket_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Delete Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_bucket_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Copy Objects', + range=false, + ), + ]), + + + $.addTableExtended( + datasource='${datasource}', + title='Summary Per User By Bandwidth', + gridPosition={ h: 8, w: 12, x: 0, y: 20 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'decbytes' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'groupBy', + options: { + fields: { + User: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + instance: { + aggregations: [], + }, + user: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + '__name__ 4': true, + '__name__ 5': true, + '__name__ 6': true, + '__name__ 7': true, + 'ceph_daemon 1': true, + 'ceph_daemon 2': true, + 'ceph_daemon 3': true, + 'ceph_daemon 4': true, + 'instance 1': true, + 'instance 2': 
true, + 'instance 3': true, + 'instance 4': true, + 'instance 5': true, + 'instance 6': true, + 'instance 7': true, + 'instance_id 1': true, + 'instance_id 2': true, + 'instance_id 3': true, + 'instance_id 4': true, + 'instance_id 5': true, + 'instance_id 6': true, + 'instance_id 7': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'job 4': true, + 'job 5': true, + 'job 6': true, + 'job 7': true, + }, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #D': 4, + 'Value #F': 5, + ceph_daemon: 0, + user: 1, + }, + renameByName: { + Bucket: '', + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'List', + 'Value #D': 'Delete', + 'Value #E': 'Copy', + 'Value #F': 'Copy', + 'Value #G': '', + ceph_daemon: 'Daemon', + user: 'User', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='ceph_rgw_op_per_user_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Upload Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Get Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Delete Objects', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='Copy Objects', + range=false, + ), + ]), + + + $.addTableExtended( + datasource='${datasource}', + title='Latency(ms) Per User', + gridPosition={ h: 8, w: 12, x: 12, y: 20 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green', value: null }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'ms' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'joinByField', + options: { + byField: 'User', + mode: 'outer', + }, + }, + { + id: 'groupBy', + options: { + fields: { + User: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #C': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + 
aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + user: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + '__name__ 4': true, + '__name__ 5': true, + '__name__ 6': true, + '__name__ 7': true, + 'ceph_daemon 1': true, + 'ceph_daemon 2': true, + 'ceph_daemon 3': true, + 'ceph_daemon 4': true, + 'ceph_daemon 5': true, + 'instance 1': true, + 'instance 2': true, + 'instance 3': true, + 'instance 4': true, + 'instance 5': true, + 'instance 6': true, + 'instance 7': true, + 'instance_id 1': true, + 'instance_id 2': true, + 'instance_id 3': true, + 'instance_id 4': true, + 'instance_id 5': true, + 'instance_id 6': true, + 'instance_id 7': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'job 4': true, + 'job 5': true, + 'job 6': true, + 'job 7': true, + }, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #C': 4, + 'Value #D': 5, + 'Value #F': 6, + ceph_daemon: 0, + user: 1, + }, + renameByName: { + Bucket: '', + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'List', + 'Value #D': 'Delete', + 'Value #E': 'Copy', + 'Value #F': 'Copy', + 'Value #G': '', + ceph_daemon: 'Daemon', + user: 'User', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='ceph_rgw_op_per_user_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='ceph_rgw_op_per_user_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + + + $.addRowSchema(false, true, 'Buckets', collapsed=true) + .addPanels([ + $.addBarGaugePanel( + title='Top 5 Bucket PUTs by Operations', + datasource='${datasource}', + 
gridPosition={ x: 0, y: 29, w: 6, h: 8 }, + unit='none', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{bucket}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Bucket GETs by Operations', + datasource='${datasource}', + gridPosition={ x: 6, y: 29, w: 6, h: 8 }, + unit='none', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{bucket}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Buckets PUTs By Size', + datasource='${datasource}', + gridPosition={ x: 12, y: 29, w: 6, h: 8 }, + unit='decbytes', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{bucket}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } } } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: [] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Buckets GETs By Size', + datasource='${datasource}', + gridPosition={ x: 18, y: 29, w: 6, h: 8 }, + unit='decbytes', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{bucket}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } } } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: [] }, displayMode: 'gradient' } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + 
axisPlacement='auto', + title='Bucket PUTs by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 0, y: 37 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket GETs by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 6, y: 37 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket Copy by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 12, y: 37 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + 
format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket Delete by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 18, y: 37 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket GETs by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 0, y: 45 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket PUTs by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 6, y: 45 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: 
{ id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket List by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 12, y: 45 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket Delete by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 18, y: 45 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Bucket Copy by Operations', + datasource='${datasource}', + gridPosition={ h: 
8, w: 12, x: 0, y: 53 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{bucket}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.addTableExtended( + datasource='${datasource}', + title='Summary Per Bucket by Operations', + gridPosition={ h: 8, w: 12, x: 12, y: 53 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green' }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'none' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'joinByField', + options: { + byField: 'Bucket', + mode: 'outer', + }, + }, + { + id: 'groupBy', + options: { + fields: { + Bucket: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #C': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + aggregations: [], + operation: 'groupby', + }, + bucket: { + aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + __name__: true, + '__name__ 1': true, + '__name__ 2': true, + '__name__ 3': true, + '__name__ 4': true, + '__name__ 5': true, + '__name__ 6': true, + '__name__ 7': true, + 'ceph_daemon 1': true, + 'ceph_daemon 2': true, + 'ceph_daemon 3': true, + 'ceph_daemon 4': true, + 'instance 1': true, + 'instance 2': true, + 'instance 3': true, + 'instance 4': true, + 'instance 5': true, + 'instance 6': true, + 'instance 7': true, + 'instance_id 1': true, + 'instance_id 2': true, + 'instance_id 3': true, + 'instance_id 4': true, + 'instance_id 5': true, + 'instance_id 6': true, + 'instance_id 7': true, + 'job 1': true, + 'job 2': true, + 'job 3': true, + 'job 4': true, + 'job 5': true, + 'job 6': true, + 'job 7': true, + }, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #C': 4, + 'Value #D': 5, + 'Value #F': 6, + 
bucket: 1, + ceph_daemon: 0, + }, + renameByName: { + Bucket: '', + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'List', + 'Value #D': 'Delete', + 'Value #E': 'Copy', + 'Value #F': 'Copy', + 'Value #G': '', + bucket: 'Bucket', + ceph_daemon: 'Daemon', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + ]) + { gridPos: { x: 0, y: 28, w: 24, h: 1 } }, + + + $.addRowSchema(false, true, 'Users', collapsed=true) + .addPanels([ + $.addBarGaugePanel( + title='Top 5 Users PUTs By Operations', + datasource='${datasource}', + gridPosition={ x: 0, y: 62, w: 6, h: 8 }, + unit='none', + thresholds={ color: 'green' } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)\n' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{user}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Users GETs by Operations', + datasource='${datasource}', + gridPosition={ x: 6, y: 62, w: 6, h: 8 }, + unit='none', + 
thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)\n' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{user}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } }, overrides: [{ matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }] } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: ['lastNotNull'] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Users PUTs by Size', + datasource='${datasource}', + gridPosition={ x: 12, y: 62, w: 6, h: 8 }, + unit='decbytes', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{user}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } } } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: [] }, displayMode: 'gradient' } }, + + + $.addBarGaugePanel( + title='Top 5 Users GETs By Size', + datasource='${datasource}', + gridPosition={ x: 18, y: 62, w: 6, h: 8 }, + unit='decbytes', + thresholds={ color: 'green', value: null } + ) + .addTargets([ + $.addTargetSchema( + expr='topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})\n)' % $.matchers(), + datasource='${datasource}', + legendFormat='{{ceph_daemon}} - {{user}}', + range=false, + instant=true + ), + ]) + { fieldConfig: { defaults: { color: { mode: 'thresholds' }, thresholds: { mode: 'absolute', steps: [{ color: 'green' }] } } } } + + { options: { orientation: 'horizontal', reduceOptions: { calcs: [] }, displayMode: 'gradient' } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User PUTs by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 0, y: 70 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: 
['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User GETs by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 6, y: 70 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User Delete by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 12, y: 70 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User COPY by Size', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 18, y: 70 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='decbytes', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum 
by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User GETs by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 0, y: 78 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User PUTs by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 6, y: 78 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User List by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 12, y: 78 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + 
stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User Delete by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 6, x: 18, y: 78 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='User Copy by Operations', + datasource='${datasource}', + gridPosition={ h: 8, w: 12, x: 0, y: 86 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='table', + showLegend=true, + placement='bottom', + tooltip={ mode: 'single', sort: 'desc' }, + stackingMode='none', + spanNulls=true, + decimals=2, + thresholdsMode='absolute', + sortBy='Last *', + sortDesc=true + ) + .addThresholds([ + { color: 'green' }, + ]) + .addOverrides([ + { matcher: { id: 'byType', unit: 'number' }, properties: [{ id: 'color' }, { id: 'color', value: { mode: 'palette-classic' } }] }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource='${datasource}', + format='time_series', + instant=false, + legendFormat='{{ceph_daemon}} - {{user}}', + step=300, + range=true, + ), + ] + ) + { options: { legend: { calcs: ['lastNotNull'], displayMode: 'table', placement: 'bottom', showLegend: true, sortBy: 'Last *', sortDesc: true }, tooltip: { mode: 'single', sort: 'desc' } } }, + + + $.addTableExtended( 
+ datasource='${datasource}', + title='Summary Per User By Operations', + gridPosition={ h: 8, w: 12, x: 12, y: 86 }, + options={ + footer: { + fields: '', + reducer: ['sum'], + countRows: false, + enablePagination: false, + show: false, + }, + frameIndex: 1, + showHeader: true, + sortBy: [ + { + desc: true, + displayName: 'PUTs', + }, + ], + }, + custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: false, inspect: false }, + thresholds={ + mode: 'absolute', + steps: [ + { color: 'green' }, + ], + }, + overrides=[{ + matcher: { id: 'byType', options: 'number' }, + properties: [ + { id: 'unit', value: 'none' }, + ], + }], + pluginVersion='9.4.7' + ) + .addTransformations([ + { + id: 'merge', + options: {}, + }, + { + id: 'joinByField', + options: { + byField: 'User', + mode: 'outer', + }, + }, + { + id: 'groupBy', + options: { + fields: { + User: { + aggregations: [], + operation: 'groupby', + }, + 'Value #A': { + aggregations: [], + operation: 'groupby', + }, + 'Value #B': { + aggregations: [], + operation: 'groupby', + }, + 'Value #C': { + aggregations: [], + operation: 'groupby', + }, + 'Value #D': { + aggregations: [], + operation: 'groupby', + }, + 'Value #F': { + aggregations: [], + operation: 'groupby', + }, + ceph_daemon: { + aggregations: [], + operation: 'groupby', + }, + user: { + aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: {}, + indexByName: { + 'Value #A': 2, + 'Value #B': 3, + 'Value #C': 4, + 'Value #D': 5, + 'Value #F': 6, + ceph_daemon: 0, + user: 1, + }, + renameByName: { + 'Value #A': 'PUTs', + 'Value #B': 'GETs', + 'Value #C': 'LIST', + 'Value #D': 'DELETE', + 'Value #F': 'COPY', + ceph_daemon: 'Daemon', + user: 'User', + }, + }, + }, + ]).addTargets([ + $.addTargetSchema( + expr='sum by (user, ceph_daemon) (ceph_rgw_op_per_user_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (user, ceph_daemon) (ceph_rgw_op_per_user_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (user, ceph_daemon) (ceph_rgw_op_per_user_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (user, ceph_daemon) (ceph_rgw_op_per_user_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + $.addTargetSchema( + expr='sum by (user, ceph_daemon) (ceph_rgw_op_per_user_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s})' % $.matchers(), + datasource={ type: 'prometheus', uid: '${datasource}' }, + format='table', + hide=false, + exemplar=false, + instant=true, + interval='', + legendFormat='__auto', + range=false, + ), + ]), + ]) + { gridPos: { x: 0, y: 29, w: 24, h: 1 } }, + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet new file mode 100644 index 000000000..79a4b7a14 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -0,0 +1,939 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +(import 'utils.libsonnet') { + 'radosgw-sync-overview.json': + local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = + $.graphPanelSchema({}, + title, + '', + 'null as zero', + true, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema( + 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))' + % ($.matchers() + { rgwMetric: rgwMetric }), + '{{source_zone}}' + ), + ] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'RGW Sync Overview', + '', + 'rgw-sync-overview', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + + .addTemplate( + $.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + null, + 'rgw.(.*)' + ) + ) + .addPanels([ + RgwSyncOverviewPanel( + 'Replication (throughput) from Source Zone', + 'Bps', + null, + 'ceph_data_sync_from_zone_fetch_bytes_sum', + 0, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Replication (objects) from Source Zone', + 'short', + 'Objects/s', + 'ceph_data_sync_from_zone_fetch_bytes_count', + 8, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Polling Request Latency from Source Zone', + 'ms', + null, + 'ceph_data_sync_from_zone_poll_latency_sum', + 16, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Unsuccessful Object Replications from Source Zone', + 'short', + 'Count/s', + 'ceph_data_sync_from_zone_fetch_errors', + 0, + 7, + 8, + 7 + ), + $.timeSeriesPanel( + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', + title='Replication(Time) Delta per shard', + datasource='$datasource', + gridPosition={ h: 7, w: 16, x: 8, y: 7 }, + fillOpacity=0, + pointSize=5, + showPoints='auto', + unit='s', + displayMode='table', + showLegend=true, + placement='right', + tooltip={ mode: 'multi', sort: 'desc' }, + stackingMode='none', + spanNulls=false, + decimals=2, + thresholdsMode='absolute', + 
sortBy='Last *', + sortDesc=true + ) + .addCalcs(['lastNotNull']) + .addThresholds([ + { color: 'green', value: null }, + { color: 'red', value: 80 }, + ]) + .addTargets( + [ + $.addTargetSchema( + expr='rate(ceph_rgw_sync_delta_sync_delta[$__rate_interval])', + datasource='$datasource', + instant=false, + legendFormat='{{instance_id}} - {{shard_id}}', + range=true, + ), + ] + ), + ]), + 'radosgw-overview.json': + local RgwOverviewPanel( + title, + description, + formatY1, + formatY2, + expr1, + legendFormat1, + x, + y, + w, + h, + datasource='$datasource', + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false + ) = + $.graphPanelSchema( + {}, + title, + description, + 'null as zero', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + datasource, + legend_alignAsTable, + legend_avg, + legend_min, + legend_max, + legend_current, + legend_values + ) + .addTargets( + [$.addTargetSchema(expr1, legendFormat1)] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'RGW Overview', + '', + 'WAkugZpiz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + '', + '.*' + ) + ) + .addTemplate( + $.addTemplateSchema( + 'code', + '$datasource', + 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)', + 1, + true, + 1, + 'HTTP Code', + '' + ) + ) + .addTemplate( + $.addTemplateSchema( + 'job_haproxy', + '$datasource', + 'label_values(haproxy_server_status, job)', + 1, + true, + 1, + 'job haproxy', + '(.*)', + multi=true, + allValues='.+', + ), + ) + .addTemplate( + $.addTemplateSchema( + 'ingress_service', + '$datasource', + 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)', + 1, + true, + 1, + 'Ingress Service', + '' + ) + ) + .addPanels([ + $.addRowSchema(false, + true, + 'RGW Overview - All Gateways') + + { + gridPos: { x: 0, y: 0, w: 24, h: 1 }, + }, + RgwOverviewPanel( + 'Average GET/PUT Latencies by RGW Instance', + '', + 's', + 'short', + ||| + label_replace( + rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + 'GET {{rgw_host}}', + 0, + 1, + 8, + 7 + ).addTargets( + [ + $.addTargetSchema( + ||| + label_replace( + rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + 
'PUT {{rgw_host}}' + ), + ] + ), + RgwOverviewPanel( + 'Total Requests/sec by RGW Instance', + '', + 'none', + 'short', + ||| + sum by (rgw_host) ( + label_replace( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ) + ||| % $.matchers(), + '{{rgw_host}}', + 8, + 1, + 7, + 7 + ), + RgwOverviewPanel( + 'GET Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', + 's', + 'short', + ||| + label_replace( + rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + '{{rgw_host}}', + 15, + 1, + 6, + 7 + ), + RgwOverviewPanel( + 'Bandwidth Consumed by Type', + 'Total bytes transferred in/out of all radosgw instances within the cluster', + 'bytes', + 'short', + 'sum(rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(), + 'GETs', + 0, + 8, + 8, + 6 + ).addTargets( + [$.addTargetSchema('sum(rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval]))' % $.matchers(), + 'PUTs')] + ), + RgwOverviewPanel( + 'Bandwidth by RGW Instance', + 'Total bytes transferred in/out through get/put operations, by radosgw instance', + 'bytes', + 'short', + ||| + label_replace(sum by (instance_id) ( + rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval])) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + '{{rgw_host}}', + 8, + 8, + 7, + 6 + ), + RgwOverviewPanel( + 'PUT Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', + 's', + 'short', + ||| + label_replace( + rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + '{{rgw_host}}', + 15, + 8, + 6, + 6 + ), + $.addRowSchema( + false, true, 'RGW Overview - HAProxy Metrics' + ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } }, + RgwOverviewPanel( + 'Total responses by HTTP code', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval] + ) + ) by (code) + |||, + 'Frontend {{ code }}', + 0, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval] + ) + ) by (code) + |||, 'Backend {{ code }}' + ), + ] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + { alias: '/.*1.*/' }, + { alias: '/.*2.*/' }, + { alias: '/.*3.*/' }, + { alias: '/.*4.*/' }, + { alias: '/.*5.*/' }, + { alias: '/.*other.*/' }, + ]), + RgwOverviewPanel( + 'Total requests / responses', + '', + 'short', + 'short', + ||| + sum( 
+ rate( + haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, + 'Requests', + 5, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Response errors', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Requests errors' + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Backend redispatch', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Backend retry', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Request denied', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"} + ) by (instance) + |||, 'Backend Queued', 'time_series', 2 + ), + ] + ) + .addSeriesOverride([ + { + alias: '/.*Response.*/', + transform: 'negative-Y', + }, + { + alias: '/.*Backend.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Total number of connections', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, + 'Front', + 10, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Back' + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Back errors' + ), + ] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Current total of incoming / outgoing bytes', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, + 'IN Front', + 15, + 12, + 6, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, 'OUT Front', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) 
+ |||, 'IN Back', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, 'OUT Back', 'time_series', 2 + ), + ] + ) + .addSeriesOverride([ + { + alias: '/.*OUT.*/', + transform: 'negative-Y', + }, + ]), + ]), + 'radosgw-detail.json': + local RgwDetailsPanel(aliasColors, + title, + description, + formatY1, + formatY2, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema(aliasColors, + title, + description, + 'null as zero', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'RGW Instance Detail', + '', + 'x5ARzZtmk', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', + id='grafana-piechart-panel', + name='Pie Chart', + version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addTemplateSchema('rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + '', + '') + ) + .addPanels([ + $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + RgwDetailsPanel( + {}, + '$rgw_servers GET/PUT Latencies', + '', + 's', + 'short', + ||| + sum by (instance_id) ( + rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + ||| + sum by (instance_id) ( + rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'GET {{ceph_daemon}}', + 'PUT {{ceph_daemon}}', + 0, + 1, + 6, + 8 + ), + RgwDetailsPanel( + {}, + 'Bandwidth by HTTP Operation', + '', + 'bytes', + 'short', + ||| + rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + ||| + rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) + ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'GETs {{ceph_daemon}}', + 'PUTs {{ceph_daemon}}', + 6, + 1, + 7, + 8 + ), + RgwDetailsPanel( + { + GETs: '#7eb26d', + Other: '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + 'Requests Failed': '#bf1b00', + }, + 'HTTP Request Breakdown', + '', + 'short', + 'short', + ||| + 
rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + ||| + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'Requests Failed {{ceph_daemon}}', + 'GETs {{ceph_daemon}}', + 13, + 1, + 7, + 8 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'PUTs {{ceph_daemon}}' + ), + $.addTargetSchema( + ||| + ( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'Other {{ceph_daemon}}' + ), + ] + ), + + $.pieChartPanel('Workload Breakdown', + '', + '$datasource', + { x: 20, y: 1, w: 4, h: 8 }, + 'table', + 'bottom', + true, + [], + { mode: 'single', sort: 'none' }, + 'pie', + ['percent', 'value'], + 'palette-classic', + overrides=[ + { + matcher: { id: 'byName', options: 'Failures' }, + properties: [ + { id: 'color', value: { mode: 'fixed', fixedColor: '#bf1b00' } }, + ], + }, + { + matcher: { id: 'byName', options: 'GETs' }, + properties: [ + { id: 'color', value: { mode: 'fixed', fixedColor: '#7eb26d' } }, + ], + }, + { + matcher: { id: 'byName', options: 'Other (HEAD,POST,DELETE)' }, + properties: [ + { id: 'color', value: { mode: 'fixed', fixedColor: '#447ebc' } }, + ], + }, + { + matcher: { id: 'byName', options: 'PUTs' }, + properties: [ + { id: 'color', value: { mode: 'fixed', fixedColor: '#eab839' } }, + ], + }, + { + matcher: { id: 'byName', options: 'Requests' }, + properties: [ + { id: 'color', value: { mode: 'fixed', fixedColor: '#3f2b5b' } }, + ], + }, + ], + reduceOptions={ values: false, calcs: ['lastNotNull'], fields: '' }) + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'Failures {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'GETs {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'PUTs {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + ||| + ( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) + ) * on (instance_id) group_left (ceph_daemon) + ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s} + ||| % $.matchers(), + 'Other (DELETE,LIST) {{ceph_daemon}}' + )), + ]), +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/timeseries_panel.libsonnet 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/timeseries_panel.libsonnet new file mode 100644 index 000000000..7da147cf5 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/timeseries_panel.libsonnet @@ -0,0 +1,141 @@ +{ + /** + * Creates a [Time series panel](https://grafana.com/docs/grafana/latest/panels-visualizations/visualizations/time-series/). + * + * @name timeseries_panel.new + * + * @param title (default `''`) Panel title. + * @param description (default null) Panel description. + */ + new( + title='', + description=null, + pluginVersion='9.1.3', + gridPos={}, + datasource='', + colorMode='palette-classic', + axisCenteredZero=false, + axisColorMode='text', + axisLabel='', + axisPlacement='auto', + barAlignment=0, + drawStyle='line', + fillOpacity=0, + gradientMode='none', + lineInterpolation='linear', + lineWidth=0, + pointSize=0, + scaleDistributionType='linear', + showPoints='', + spanNulls=false, + stackingGroup='A', + stackingMode='none', + thresholdsStyleMode='off', + decimals=null, + thresholdsMode='absolute', + unit='none', + tooltip={}, + legend={}, + displayMode='list', + placement='bottom', + showLegend=true, + min=null, + scaleDistributionLog=null, + sortBy=null, + sortDesc=null, + ):: { + title: title, + type: 'timeseries', + [if description != null then 'description']: description, + pluginVersion: pluginVersion, + gridPos: gridPos, + datasource: datasource, + fieldConfig: { + defaults: { + color: { mode: colorMode }, + custom: { + axisCenteredZero: axisCenteredZero, + axisColorMode: axisColorMode, + axisLabel: axisLabel, + axisPlacement: axisPlacement, + barAlignment: barAlignment, + drawStyle: drawStyle, + fillOpacity: fillOpacity, + gradientMode: gradientMode, + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + lineInterpolation: lineInterpolation, + lineWidth: lineWidth, + pointSize: pointSize, + scaleDistribution: { + [if scaleDistributionLog != null then 'scaleDistributionLog']: scaleDistributionLog, + type: scaleDistributionType, + }, + showPoints: showPoints, + spanNulls: spanNulls, + stacking: { + group: stackingGroup, + mode: stackingMode, + }, + thresholdsStyle: { + mode: thresholdsStyleMode, + }, + }, + [if decimals != null then 'decimals']: decimals, + [if min != null then 'min']: min, + thresholds: { + mode: thresholdsMode, + steps: [], + }, + unit: unit, + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: displayMode, + placement: placement, + showLegend: showLegend, + [if sortBy != null then 'sortBy']: sortBy, + [if sortDesc != null then 'sortDesc']: sortDesc, + }, + tooltip: tooltip, + }, + // Overrides + addOverride( + matcher=null, + properties=null, + ):: self { + fieldConfig+: { + overrides+: [ + { + [if matcher != null then 'matcher']: matcher, + [if properties != null then 'properties']: properties, + }, + ], + }, + }, + // thresholds + addThreshold(step):: self { + fieldConfig+: { defaults+: { thresholds+: { steps+: [step] } } }, + }, + addCalc(calc):: self { + options+: { legend+: { calcs+: [calc] } }, + }, + _nextTarget:: 0, + addTarget(target):: self { + // automatically ref id in added targets. 
+ local nextTarget = super._nextTarget, + _nextTarget: nextTarget + 1, + targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }], + }, + addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self), + addThresholds(steps):: std.foldl(function(p, s) p.addThreshold(s), steps, self), + addCalcs(calcs):: std.foldl(function(p, t) p.addCalc(t), calcs, self), + addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self), + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/utils.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/utils.libsonnet new file mode 100644 index 000000000..333a444dd --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards/utils.libsonnet @@ -0,0 +1,754 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local pieChartPanel = import 'piechart_panel.libsonnet'; +local timeSeries = import 'timeseries_panel.libsonnet'; + +{ + _config:: error 'must provide _config', + + dashboardSchema(title, + description, + uid, + time_from, + refresh, + schemaVersion, + tags, + timezone):: + g.dashboard.new(title=title, + description=description, + uid=uid, + time_from=time_from, + refresh=refresh, + schemaVersion=schemaVersion, + tags=tags, + timezone=timezone), + + graphPanelSchema(aliasColors, + title, + description, + nullPointMode, + stack, + formatY1, + formatY2, + labelY1, + labelY2, + min, + fill, + datasource, + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false):: + g.graphPanel.new(aliasColors=aliasColors, + title=title, + description=description, + nullPointMode=nullPointMode, + stack=stack, + formatY1=formatY1, + formatY2=formatY2, + labelY1=labelY1, + labelY2=labelY2, + min=min, + fill=fill, + datasource=datasource, + legend_alignAsTable=legend_alignAsTable, + legend_avg=legend_avg, + legend_min=legend_min, + legend_max=legend_max, + legend_current=legend_current, + legend_values=legend_values), + + + addTargetSchema( + expr, + legendFormat='', + format='time_series', + intervalFactor=1, + instant=null, + datasource=null, + step=null, + interval=null, + range=null, + hide=null, + metric=null, + aggregation=null, + alias=null, + decimals=null, + displayAliasType=null, + displayType=null, + displayValueWithAlias=null, + units=null, + valueHandler=null, + warn=null, + crit=null, + exemplar=null, + ):: + g.prometheus.target(expr=expr, + legendFormat=legendFormat, + format=format, + intervalFactor=intervalFactor, + instant=instant, + datasource=datasource) + { + [if step != null then 'step']: step, + [if interval != null then 'interval']: interval, + [if range != null then 'range']: range, + [if hide != null then 'hide']: hide, + [if metric != null then 'metric']: metric, + [if aggregation != null then 'aggregation']: aggregation, + [if alias != null then 'alias']: alias, + [if decimals != null then 'decimals']: decimals, + [if displayAliasType != null then 'displayAliasType']: displayAliasType, + [if displayType != null then 'displayType']: displayType, + [if displayValueWithAlias != null then 'displayValueWithAlias']: displayValueWithAlias, + [if units != null then 'units']: units, + [if valueHandler != null then 'valueHandler']: 
valueHandler, + [if warn != null then 'warn']: warn, + [if crit != null then 'crit']: crit, + [if exemplar != null then 'exemplar']: exemplar, + }, + + addTemplateSchema(name, + datasource, + query, + refresh, + includeAll, + sort, + label, + regex, + hide='', + multi=false, + allValues=null, + current=null):: + g.template.new(name=name, + datasource=datasource, + query=query, + refresh=refresh, + includeAll=includeAll, + sort=sort, + label=label, + regex=regex, + hide=hide, + multi=multi, + allValues=allValues, + current=current), + + addAnnotationSchema(builtIn, + datasource, + enable, + hide, + iconColor, + name, + type):: + g.annotation.datasource(builtIn=builtIn, + datasource=datasource, + enable=enable, + hide=hide, + iconColor=iconColor, + name=name, + type=type), + + addRowSchema( + collapse, + showTitle, + title, + collapsed=null + ):: + g.row.new(collapse=collapse, showTitle=showTitle, title=title) + { + [if collapsed != null then 'collapsed']: collapsed, + }, + + addSingleStatSchema(colors, + datasource, + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparklineShow, + thresholds):: + g.singlestat.new(colors=colors, + datasource=datasource, + format=format, + title=title, + description=description, + valueName=valueName, + colorValue=colorValue, + gaugeMaxValue=gaugeMaxValue, + gaugeShow=gaugeShow, + sparklineShow=sparklineShow, + thresholds=thresholds), + + addPieChartSchema(aliasColors, + datasource, + description, + legendType, + pieType, + title, + valueName):: + g.pieChartPanel.new(aliasColors=aliasColors, + datasource=datasource, + description=description, + legendType=legendType, + pieType=pieType, + title=title, + valueName=valueName), + + addStyle(alias, + colorMode, + colors, + dateFormat, + decimals, + mappingType, + pattern, + thresholds, + type, + unit, + valueMaps):: + { + alias: alias, + colorMode: colorMode, + colors: colors, + dateFormat: dateFormat, + decimals: decimals, + mappingType: mappingType, + pattern: pattern, + thresholds: thresholds, + type: type, + unit: unit, + valueMaps: valueMaps, + }, + + matchers():: + local clusterMatcher = '%s=~"$cluster"' % $._config.clusterLabel; + { + // Common labels + matchers: (if $._config.showMultiCluster then clusterMatcher + ', ' else ''), + }, + + + addClusterTemplate():: + $.addTemplateSchema( + 'cluster', + '$datasource', + 'label_values(ceph_health_status, %s)' % $._config.clusterLabel, + 1, + false, + 1, + 'cluster', + '(.*)', + if !$._config.showMultiCluster then 'variable' else '', + multi=false, + allValues=null, + ), + + overviewStyle(alias, + pattern, + type, + unit, + colorMode=null, + thresholds=[], + valueMaps=[]):: + $.addStyle(alias, + colorMode, + [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + thresholds, + type, + unit, + valueMaps), + + simpleGraphPanel(alias, + title, + description, + formatY1, + labelY1, + min, + expr, + legendFormat, + x, + y, + w, + h):: + $.graphPanelSchema(alias, + title, + description, + 'null', + false, + formatY1, + 'short', + labelY1, + null, + min, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, legendFormat)] + ) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } }, + + simpleSingleStatPanel(format, + title, + description, + valueName, + expr, + instant, + targetFormat, + x, + y, + w, + h):: + 
$.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget($.addTargetSchema(expr, '', targetFormat, 1, instant)) + { + gridPos: { x: x, y: y, w: w, h: h }, + }, + gaugeSingleStatPanel(format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds, + expr, + targetFormat, + x, + y, + w, + h):: + $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds) + .addTarget($.addTargetSchema(expr, '', targetFormat)) + { gridPos: { x: + x, y: y, w: w, h: h } }, + + simplePieChart(alias, description, title):: + $.addPieChartSchema(alias, + '$datasource', + description, + 'Under graph', + 'pie', + title, + 'current'), + + addStatPanel( + title, + description='', + transparent=false, + datasource=null, + color={}, + unit='none', + overrides=[], + gridPosition={}, + colorMode='none', + graphMode='none', + justifyMode='auto', + orientation='horizontal', + textMode='auto', + reducerFunction='lastNotNull', + pluginVersion='9.1.3', + decimals=0, + interval=null, + maxDataPoints=null, + thresholdsMode='absolute', + rootColorMode=null, + rootColors=null, + cornerRadius=null, + flipCard=null, + flipTime=null, + fontFormat=null, + displayName=null, + isAutoScrollOnOverflow=null, + isGrayOnNoData=null, + isHideAlertsOnDisable=null, + isIgnoreOKColors=null, + ):: + g.statPanel.new( + title=title, + description=description, + transparent=transparent, + datasource=datasource, + unit=unit, + colorMode=colorMode, + graphMode=graphMode, + justifyMode=justifyMode, + orientation=orientation, + textMode=textMode, + reducerFunction=reducerFunction, + pluginVersion=pluginVersion, + decimals=decimals, + thresholdsMode=thresholdsMode, + ) + { + [if interval != null then 'interval']: interval, + [if maxDataPoints != null then 'maxDataPoints']: maxDataPoints, + [if gridPosition != {} then 'gridPos']: gridPosition, + [if rootColorMode != null then 'colorMode']: rootColorMode, + [if rootColors != {} then 'colors']: rootColors, + [if cornerRadius != null then 'cornerRadius']: cornerRadius, + [if flipCard != null then 'flipCard']: flipCard, + [if flipTime != null then 'flipTime']: flipTime, + [if fontFormat != null then 'fontFormat']: fontFormat, + [if displayName != null then 'displayName']: displayName, + [if isAutoScrollOnOverflow != null then 'isAutoScrollOnOverflow']: isAutoScrollOnOverflow, + [if isGrayOnNoData != null then 'isGrayOnNoData']: isGrayOnNoData, + [if isHideAlertsOnDisable != null then 'isHideAlertsOnDisable']: isHideAlertsOnDisable, + [if isIgnoreOKColors != null then 'isIgnoreOKColors']: isIgnoreOKColors, + }, + + addAlertListPanel( + title, + datasource=null, + gridPosition={}, + alertInstanceLabelFilter=null, + alertName=null, + dashboardAlerts=null, + groupBy=null, + groupMode=null, + maxItems=null, + sortOrder=null, + stateFilter=null, + viewMode='list' + ):: + g.alertlist.new( + title=title, + datasource=datasource, + ) + { + gridPos: gridPosition, + options: { + [if alertInstanceLabelFilter != null then 'alertInstanceLabelFilter']: alertInstanceLabelFilter, + [if alertName != null then 'alertName']: alertName, + [if dashboardAlerts != null then 'dashboardAlerts']: dashboardAlerts, + [if groupBy != null then 'groupBy']: groupBy, + [if groupMode != null then 
'groupMode']: groupMode, + [if maxItems != null then 'maxItems']: maxItems, + [if sortOrder != null then 'sortOrder']: sortOrder, + [if stateFilter != null then 'stateFilter']: stateFilter, + viewMode: viewMode, + }, + }, + + addCustomTemplate(name='', + query='', + current='', + valuelabels={}, + refresh=0, + label='Interval', + auto_count=10, + auto_min='2m', + options=[], + auto=null):: + g.template.interval(name=name, + query=query, + current=current, + label=label, + auto_count=auto_count, + auto_min=auto_min,) + { + options: options, + refresh: refresh, + valuelabels: valuelabels, + [if auto != null then 'auto']: auto, + }, + + addGaugePanel(title='', + description='', + transparent=false, + datasource='$datasource', + gridPosition={}, + pluginVersion='9.1.3', + unit='percentunit', + instant=false, + reducerFunction='lastNotNull', + steps=[], + max=1, + min=0, + maxDataPoints=100, + interval='1m'):: + g.gaugePanel.new(title=title, + description=description, + transparent=transparent, + datasource=datasource, + pluginVersion=pluginVersion, + unit=unit, + reducerFunction=reducerFunction, + max=max, + min=min) + { + gridPos: gridPosition, + maxDataPoints: maxDataPoints, + interval: interval, + }, + + addBarGaugePanel(title='', + description='', + datasource='${DS_PROMETHEUS}', + gridPosition={}, + unit='percentunit', + thresholds={}):: + g.barGaugePanel.new(title, description, datasource, unit, thresholds) + { + gridPos: gridPosition, + }, + addTableExtended( + title='', + datasource=null, + description=null, + sort=null, + styles='', + transform=null, + pluginVersion='9.1.3', + options=null, + gridPosition={}, + custom=null, + decimals=null, + thresholds=null, + unit=null, + overrides=[], + color=null + ):: + g.tablePanel.new(datasource=datasource, + description=description, + sort=sort, + styles=styles, + title=title, + transform=transform) + { + pluginVersion: pluginVersion, + gridPos: gridPosition, + [if options != null then 'options']: options, + fieldConfig+: { + defaults+: { + [if custom != null then 'custom']: custom, + [if decimals != null then 'decimals']: decimals, + [if thresholds != null then 'thresholds']: thresholds, + [if unit != null then 'unit']: unit, + [if color != null then 'color']: color, + + }, + overrides: overrides, + }, + }, + timeSeriesPanel( + title='', + datasource=null, + gridPosition={}, + colorMode='palette-classic', + axisCenteredZero=false, + axisColorMode='text', + axisLabel='', + axisPlacement='auto', + barAlignment=0, + drawStyle='line', + fillOpacity=0, + gradientMode='none', + lineInterpolation='linear', + lineWidth=0, + pointSize=0, + scaleDistributionType='linear', + showPoints='', + spanNulls=false, + stackingGroup='A', + stackingMode='none', + thresholdsStyleMode='off', + decimals=null, + thresholdsMode='absolute', + unit='none', + tooltip={ mode: 'multi', sort: 'none' }, + pluginVersion='9.1.3', + displayMode='list', + placement='bottom', + showLegend=true, + interval=null, + min=null, + scaleDistributionLog=null, + sortBy=null, + sortDesc=null, + ):: + timeSeries.new( + title=title, + gridPos=gridPosition, + datasource=datasource, + colorMode=colorMode, + axisCenteredZero=axisCenteredZero, + axisColorMode=axisColorMode, + axisLabel=axisLabel, + axisPlacement=axisPlacement, + barAlignment=barAlignment, + drawStyle=drawStyle, + fillOpacity=fillOpacity, + gradientMode=gradientMode, + lineInterpolation=lineInterpolation, + lineWidth=lineWidth, + pointSize=pointSize, + scaleDistributionType=scaleDistributionType, + showPoints=showPoints, + 
spanNulls=spanNulls, + stackingGroup=stackingGroup, + stackingMode=stackingMode, + thresholdsStyleMode=thresholdsStyleMode, + decimals=decimals, + thresholdsMode=thresholdsMode, + unit=unit, + displayMode=displayMode, + placement=placement, + showLegend=showLegend, + tooltip=tooltip, + min=min, + scaleDistributionLog=scaleDistributionLog, + sortBy=sortBy, + sortDesc=sortDesc, + ) + { + pluginVersion: pluginVersion, + [if interval != null then 'interval']: interval, + }, + + pieChartPanel( + title, + description='', + datasource=null, + gridPos={}, + displayMode='table', + placement='bottom', + showLegend=true, + displayLabels=[], + tooltip={}, + pieType='pie', + values=[], + colorMode='auto', + overrides=[], + reduceOptions={}, + ):: + pieChartPanel.new( + title, + description=description, + datasource=datasource, + gridPos=gridPos, + displayMode=displayMode, + placement=placement, + showLegend=showLegend, + displayLabels=displayLabels, + tooltip=tooltip, + pieType=pieType, + values=values, + colorMode=colorMode, + overrides=overrides, + reduceOptions=reduceOptions, + ), + + heatMapPanel( + title='', + datasource=null, + gridPosition={}, + colorMode='spectrum', + cardColor='#b4ff00', + colorScale='sqrt', + colorScheme='interpolateOranges', + colorExponent=0.5, + pluginVersion='9.1.3', + dataFormat='timeseries', + hideFrom={ legend: false, tooltip: false, viz: false }, + scaleDistributionType='linear', + legendShow=false, + optionsCalculate=false, + optionsCalculation={ + yBuckets: { + mode: 'count', + scale: { log: 2, type: 'log' }, + value: '1', + }, + }, + optionsCellGap=2, + optionsCellValues={}, + optionsColor={}, + optionsExemplars={}, + optionsFilterValues={}, + optionsLegend={}, + optionsRowFrame={}, + optionsShowValue='never', + optionsToolTip={}, + optionsYAxis={}, + xBucketSize=null, + yAxisDecimals=null, + yAxisFormat='short', + yAxisLogBase=1, + yAxisMin=null, + yAxisMax=null, + yAxisShow=true, + yAxisSplitFactor=1, + yBucketSize=null, + yBucketBound='auto' + ) + :: g.heatmapPanel.new( + title=title, + datasource=datasource, + color_mode=colorMode, + color_cardColor=cardColor, + color_colorScale=colorScale, + color_colorScheme=colorScheme, + color_exponent=colorExponent, + legend_show=legendShow, + xBucketSize=xBucketSize, + yAxis_decimals=yAxisDecimals, + yAxis_format=yAxisFormat, + yAxis_logBase=yAxisLogBase, + yAxis_min=yAxisMin, + yAxis_max=yAxisMax, + yAxis_show=yAxisShow, + yAxis_splitFactor=yAxisSplitFactor, + yBucketSize=yBucketSize, + yBucketBound=yBucketBound + ) + { + gridPos: gridPosition, + pluginVersion: pluginVersion, + color+: { + colorScheme: colorScheme, + }, + fieldConfig: { + defaults: { + custom: { + hideFrom: hideFrom, + scaleDistribution: { + type: scaleDistributionType, + }, + }, + }, + }, + options: { + calculate: optionsCalculate, + calculation: optionsCalculation, + cellGap: optionsCellGap, + cellValues: optionsCellValues, + color: optionsColor, + exemplars: optionsExemplars, + filterValues: optionsFilterValues, + legend: optionsLegend, + rowsFrame: optionsRowFrame, + showValue: optionsShowValue, + tooltip: optionsToolTip, + yAxis: optionsYAxis, + }, + }, +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster-advanced.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster-advanced.json new file mode 100644 index 000000000..ff31ebf23 --- 
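The hunk above completes the Grafonnet helper library that the generated Ceph dashboards are built from. As a quick orientation for reviewers, the sketch below shows how one of these helpers might be invoked from a dashboard definition. It is illustrative only and not part of the patch: the import path 'utils.libsonnet' is an assumption, and the query and grid coordinates are modelled on the "Available Capacity" gauge in the generated ceph-cluster-advanced.json that follows.

    // Illustrative usage sketch (not part of the vendored diff).
    // 'utils.libsonnet' is a hypothetical path for the helper library above;
    // gaugeSingleStatPanel delegates to addSingleStatSchema/addTargetSchema,
    // which are defined earlier in the same vendored file.
    local u = import 'utils.libsonnet';

    // A single gauge panel mirroring the "Available Capacity" panel in the
    // generated dashboard: free capacity as a fraction of total capacity.
    u.gaugeSingleStatPanel(
      'percentunit',                 // format
      'Available Capacity',          // title
      '',                            // description
      'current',                     // valueName
      false,                         // colorValue
      1,                             // gaugeMaxValue
      true,                          // gaugeShow
      false,                         // sparkLineShow
      '',                            // thresholds
      '(ceph_cluster_total_bytes - ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes',  // expr
      'time_series',                 // targetFormat
      3, 1, 3, 6                     // x, y, w, h (grid position)
    )

Evaluating such a file with jsonnet would yield the panel object that, after serialization, corresponds to one entry of the "panels" array in the generated dashboard JSON below.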
/dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster-advanced.json @@ -0,0 +1,3813 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "heatmap", + "name": "Heatmap", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Ceph cluster overview", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATE", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "0": { + "text": "HEALTHY" + }, + "1": { + "text": "WARNING" + }, + "2": { + "text": "ERROR" + } + }, + "type": "value" + }, + { + "id": 1, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#9ac48a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 2 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 3, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_health_status{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Ceph health status", + "transparent": true, + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.10000000000000001 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0.29999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 4, + "interval": "1m", + "links": [ ], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_cluster_total_bytes{cluster=~\"$cluster\", }-ceph_cluster_total_used_bytes{cluster=~\"$cluster\", })/ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Available Capacity", + "transparent": false, + "type": "gauge" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 2, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 1 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 5, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Cluster Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write Throughput", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 7, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": 
"horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read Throughput", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgb(255, 0, 0)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 0, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 8, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "All", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "All", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "In", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_in{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "In", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Out", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in{cluster=~\"$cluster\", } == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Out", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + }, + { + "aggregation": "Last", + "alias": "Up", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Up", + "refId": "D", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Down", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": 
"Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~\"$cluster\", } == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + } + ], + "title": "OSDs", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 9, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~\"$cluster\", } == 1) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~\"$cluster\", } == 0) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Standby", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "MGRs", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#987d24", + "mode": "fixed" + } + } + ] + } + ] + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 4 + }, + "id": 10, + 
"isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"critical\", cluster=~\"$cluster\", }) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Critical", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"warning\", cluster=~\"$cluster\", }) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Warning", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "Firing Alerts", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 0.10000000000000001 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 4 + }, + "id": 11, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_used_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Used Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 4 + }, + "id": 12, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": 
"auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write IOPS", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 4 + }, + "id": 13, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read IOPS", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 4 + }, + "id": 14, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "In Quorum", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "In Quorum", + "refId": "A", + "units": "none", + "valueHandler": "Text Only" + }, + { + "aggregation": "Last", + "alias": "Total", + "crit": 1, + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "B", + "units": "none", + "valueHandler": "Text Only", + "warn": 2 + }, + { + "aggregation": "Last", + 
"alias": "MONs out of Quorum", + "crit": 1.6000000000000001, + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Annotation", + "displayValueWithAlias": "Never", + "expr": "count(ceph_mon_quorum_status{cluster=~\"$cluster\", }) - sum(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MONs out of Quorum", + "range": true, + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1.1000000000000001 + } + ], + "title": "Monitors", + "transparent": false, + "type": "stat" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 15, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 16, + "limit": 10, + "onlyAlertsOnDashboard": true, + "options": { + "alertInstanceLabelFilter": "{alertname=~\"^Ceph.+\", cluster=~\"$cluster\", }", + "alertName": "", + "dashboardAlerts": false, + "groupBy": [ ], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ ], + "title": "Alerts", + "type": "alertlist" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#c0921f", + "value": 75 + }, + { + "color": "#E02F44", + "value": 85 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total Capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + }, + { + "id": "custom.thresholdsStyle", + "value": { + "mode": "dashed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 17, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total 
Capacity", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_used_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "Capacity", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 18, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "Cluster Throughput", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 19, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": 
"sum(irate(ceph_osd_op_w{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "IOPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 20, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_bytes_used{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rbd Stored" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "transparent", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 21, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_stored_raw{cluster=~\"$cluster\", }) *on (pool_id) 
group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used RAW Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 22, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_quota_objects{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Objects Quota", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 23, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_quota_bytes{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Quota Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 24, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_objects{cluster=~\"$cluster\", }) * on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Objects Per Pool", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 25, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OBJECTS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 6, + "x": 0, + "y": 32 + }, + "id": 26, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pool_objects{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 200 + } + ], + "title": "OSD Type Count", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 6, + "y": 32 + }, + "id": 27, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_active{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Active", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_clean{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Clean", + "range": true, + "refId": "B" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_peering{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peering", + "range": true, + "refId": "C" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_degraded{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "D", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_stale{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "E", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_unclean_pgs{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Unclean", + "range": true, + "refId": "F", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_undersized{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "G", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_incomplete{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Incomplete", + "range": true, + "refId": "H" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_forced_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Forced Backfill", + "range": true, + "refId": "I" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_forced_recovery{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 
1, + "legendFormat": "Forced Recovery", + "range": true, + "refId": "J" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_creating{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Creating", + "range": true, + "refId": "K" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_wait_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Wait Backfill", + "range": true, + "refId": "L" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_deep{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Deep", + "range": true, + "refId": "M" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_scrubbing{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Scrubbing", + "range": true, + "refId": "N" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_recovering{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Recovering", + "range": true, + "refId": "O" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_repair{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Repair", + "range": true, + "refId": "P" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_down{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Down", + "range": true, + "refId": "Q" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_peered{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peered", + "range": true, + "refId": "R" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill", + "range": true, + "refId": "S" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_remapped{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Remapped", + "range": true, + "refId": "T" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_backfill_toofull{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill Toofull", + "range": true, + "refId": "U" + } + ], + "title": "PGs State", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + 
} + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 32 + }, + "id": 28, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_degraded{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_stale{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "B", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_undersized{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "C", + "step": 300 + } + ], + "title": "Stuck PGs", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 38 + }, + "id": 29, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_recovery_ops{cluster=~\"$cluster\", }[$interval]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "OPS", + "refId": "A", + "step": 300 + } + ], + "title": "Recovery Operations", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 30, + "panels": [ + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + 
"heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 31, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + }, + "value": "1" + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_osd_apply_latency_ms{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Apply Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#65c5db", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 32, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#65c5db", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_osd_commit_latency_ms{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Commit Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + 
"yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#806eb7", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 33, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#806eb7", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Read Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#f9934e", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 34, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#f9934e", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + 
} + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Write Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 35, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "avg(rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "avg(rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "B" + } + ], + "title": "Recovery Operations", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 36, + "interval": "$interval", + "options": { + "legend": { 
+ "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "avg(ceph_osd_apply_latency_ms{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "apply", + "metric": "ceph_osd_perf_apply_latency_seconds", + "refId": "A", + "step": 4 + }, + { + "datasource": "$datasource", + "expr": "avg(ceph_osd_commit_latency_ms{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "commit", + "metric": "ceph_osd_perf_commit_latency_seconds", + "refId": "B", + "step": 4 + } + ], + "title": "AVG OSD Apply + Commit Latency", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "LATENCY", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 + }, + "id": 37, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6", + "type": "row" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 38, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "OSD Services", + "range": false, + "refId": "A" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mon_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Mon Services", + "range": false, + "refId": "B" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mds_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "MDS Services", + "range": false, + "refId": "C" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_rgw_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "RGW Services", + "range": false, + "refId": "D" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mgr_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "MGR Services", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Ceph Versions", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { }, + "renameByName": { + "Time": "", + "Value #A": "OSD Services", + "Value #B": "Mon Services", + "Value #C": "MDS Services", + "Value #D": "RGW Services", + "Value #E": "MGR Services", + "ceph_version": "Ceph Version" + } + } + } + ], + "type": "table" + } + ], + "refresh": "1m", + "rows": [ ], + "schemaVersion": 38, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 10, + "auto_min": "1m", + "current": { + "text": "$__auto_interval_interval", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "5s", + "value": "5s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "type": "interval", + "valuelabels": { } + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Cluster - Advanced", + "uid": "dn13KBeTv", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json new file mode 100644 index 000000000..1fd7821a6 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json @@ -0,0 
+1,1434 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "5.0.0" + } + ], + "annotations": { + "list": [] + }, + "description": "Ceph cluster overview", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "iteration": 1525415495309, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": true, + "colorValue": false, + "colors": [ + "rgba(50, 128, 45, 0.9)", + "rgba(237, 129, 40, 0.9)", + "rgb(255, 0, 0)" + ], + "datasource": "$datasource", + "editable": false, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 21, + "interval": "1m", + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "ceph_health_status{cluster=~'$cluster'}", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "timeFrom": null, + "title": "Health Status", + "transparent": false, + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "OK", + "value": "0" + }, + { + "op": "=", + "text": "WARN", + "value": "1" + }, + { + "op": "=", + "text": "ERR", + "value": "2" + } + ], + "valueName": "current" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgb(255, 0, 0)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 0, + "datasource": "$datasource", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 43, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "All", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_metadata{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "All", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "In", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + 
"displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "In", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Out", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in{cluster=~'$cluster'} == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Out", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + }, + { + "aggregation": "Last", + "alias": "Up", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Up", + "refId": "D", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Down", + "crit": 2, + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~'$cluster'} == bool 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + } + ], + "title": "OSDs", + "type": "stat", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "All" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Out" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "value": 10, + "color": "red" + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Down" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "value": 10, + "color": "red" + } + ] + } + } + ] + } + ] + } + }, + { + "clusterName": "", + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 41, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "In Quorum", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_mon_quorum_status{cluster=~'$cluster'})", + "format": "time_series", + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "In Quorum", + "refId": "A", + "units": "none", + "valueHandler": "Text Only" + }, + { + "aggregation": "Last", + "alias": "Total", + "crit": 1, + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mon_quorum_status{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "B", + "units": "none", + "valueHandler": "Text Only", + "warn": 2 + }, + { + "aggregation": "Last", + "alias": "MONs out of Quorum", + "crit": 1.6, + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Annotation", + "displayValueWithAlias": "Never", + "expr": "count(ceph_mon_quorum_status{cluster=~'$cluster'}) - sum(ceph_mon_quorum_status{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MONs out of Quorum", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1.1 + } + ], + "title": "Monitors", + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 68, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~'$cluster'} == 1) or vector(0)", + "format": "time_series", + "intervalFactor": 1, + "instant": true, + "legendFormat": "Active", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~'$cluster'} == 0) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Standby", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "MGRs", + "type": "stat" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 2, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 47, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + 
"rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_osd_stat_bytes_used{cluster=~'$cluster'})/sum(ceph_osd_stat_bytes{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + } + ], + "thresholds": "0.7,0.8", + "title": "Capacity used", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 6, + "y": 6 + }, + "id": 53, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Active", + "color": "#508642", + "fill": 1, + "stack": "A" + }, + { + "alias": "Total", + "color": "#f9e2d2" + }, + { + "alias": "Degraded", + "color": "#eab839" + }, + { + "alias": "Undersized", + "color": "#f9934e" + }, + { + "alias": "Inconsistent", + "color": "#e24d42" + }, + { + "alias": "Down", + "color": "#bf1b00" + }, + { + "alias": "Inactive", + "color": "#bf1b00", + "fill": 4, + "linewidth": 0, + "stack": "A" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(ceph_pg_total{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "A" + }, + { + "expr": "sum(ceph_pg_active{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "B" + }, + { + "expr": "sum(ceph_pg_total{cluster=~'$cluster'} - ceph_pg_active{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inactive", + "refId": "G" + }, + { + "expr": "sum(ceph_pg_undersized{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Undersized", + "refId": "F" + }, + { + "expr": "sum(ceph_pg_degraded{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Degraded", + "refId": "C" + }, + { + "expr": "sum(ceph_pg_inconsistent{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inconsistent", + "refId": "D" + }, + { + "expr": "sum(ceph_pg_down{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "PG States", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 6 + }, + "id": 66, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Avg Apply Latency", + "color": "#7eb26d" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "quantile(0.95, ceph_osd_apply_latency_ms{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Apply Latency P_95", + "refId": "A" + }, + { + "expr": "quantile(0.95, ceph_osd_commit_latency_ms{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Commit Latency P_95", + "refId": "B" + }, + { + "expr": "avg(ceph_osd_apply_latency_ms{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Avg Apply Latency", + "refId": "C" + }, + { + "expr": "avg(ceph_osd_commit_latency_ms{cluster=~'$cluster'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Avg Commit Latency", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OSD Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 45, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 0.5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=~'$cluster'}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A" + }, + { + "expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=~'$cluster'}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + 
"max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(deriv(ceph_pool_stored{cluster=~'$cluster'}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "In-/Egress", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": " Egress (-) / Ingress (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "cards": { + "cardPadding": null, + "cardRound": 1 + }, + "color": { + "cardColor": "rgb(0, 254, 255)", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 15 + }, + "heatmap": {}, + "highlightCards": true, + "id": 55, + "legend": { + "show": true + }, + "links": [], + "span": 12, + "targets": [ + { + "expr": "ceph_osd_stat_bytes_used{cluster=~'$cluster'} / ceph_osd_stat_bytes{cluster=~'$cluster'}", + "format": "time_series", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "Util (%)", + "refId": "A", + "step": 60 + } + ], + "timeFrom": null, + "title": "OSD Capacity Utilization", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": 2, + "format": "percentunit", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 1 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 15 + }, + "heatmap": {}, + "highlightCards": true, + "id": 59, + "legend": { + "show": true + }, + "links": [], + "targets": [ + { + "expr": "ceph_osd_numpg{cluster=~'$cluster'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "#PGs", + "refId": "A" + } + ], + "title": "PGs per OSD", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + 
"format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "fill": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 64, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(ceph_osd_recovery_ops{cluster=~'$cluster'}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Op/s", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Recovery Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": "Recovery Ops/s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph", + "cluster" + ], + "templating": { + "list": [ + { + "hide": 0, + "label": null, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(ceph_health_status, cluster)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(ceph_health_status, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "auto": true, + "auto_count": 10, + "auto_min": "1m", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "datasource": null, + "hide": 0, + "includeAll": false, + "label": "Interval", + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + 
"30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph - Cluster", + "uid": "edtb0oxdq", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json new file mode 100644 index 000000000..f65ce4da6 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json @@ -0,0 +1,360 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "MDS Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_objecter_op_r{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Ops", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_objecter_op_w{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Ops", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "MDS Workload - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Reads(-) / Writes (+)", + "logBase": 1, + "max": null, + "min": 
0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_mds_server_handle_client_request{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Client Request Load - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Client Requests", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "MDS Server", + "multi": false, + "name": "mds_servers", + "options": [ ], + "query": "label_values(ceph_mds_inodes{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "MDS Performance", + "uid": "tbO9LAiZz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/host-details.json 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/host-details.json new file mode 100644 index 000000000..ef357d34b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/host-details.json @@ -0,0 +1,1434 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "$ceph_hosts System Overview", + "titleSize": "h6", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{cluster=~\"$cluster\", }))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSDs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + "interrupt": "#447EBC", + "steal": "#6D1F62", + "system": "#890F02", + "user": "#3F6833", + "wait": "#C15C17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the CPU breakdown. 
When multiple servers are selected, only the first host's cpu data is shown", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 3, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode) (\n rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval])\n) / (\n scalar(\n sum(rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]))\n ) * 100\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mode}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "% Utilization", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "Available": "#508642", + "Free": "#508642", + "Total": "#bf1b00", + "Used": "#bf1b00", + "total": "#bf1b00", + "used": "#0a50a1" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 9, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "total", + "color": "#bf1b00", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "A" + }, + { + "expr": "node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "total", + "refId": "B" + }, + { + "expr": "(\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "buffers/cache", + "refId": "C" + }, + { + "expr": "(\n node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) - (\n (\n node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) +\n (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "used", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "RAM Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "RAM used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "decbytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (device) (\n rate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "sum by (device) (\n rate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n 
rate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval])\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "pps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network drop rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Each OSD consists of a Journal/WAL partition and a data partition. 
The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 6 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_osd_stat_bytes{cluster=~\"$cluster\", } and\n on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "pps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 6 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network error rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + 
"collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Disk Performance Statistics", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) writes", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n rate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{cluster=~\"$cluster\", },\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. 
Each device is shown by device name, and corresponding OSD id", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 12 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*read/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) write", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Throughput by Disk", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with it's corresponding OSD id", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 21 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by(instance, device) (label_replace(\n (rate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001) or\n (rate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 21 + }, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n (rate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) / 10) or\n rate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) * 100\n ), \"instance\", \"$1\", 
\"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "%Util", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "displayName", + "value": "Instance" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Slow Ops" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 15, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\", cluster=~\"$cluster\", }))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops per Host", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ 
], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "ceph_hosts", + "options": [ ], + "query": "label_values({__name__=~\"ceph_.+_metadata\", cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "([^.]*).*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Host Details", + "uid": "rtOg0AiWz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/hosts-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/hosts-overview.json new file mode 100644 index 000000000..adbf676f5 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/hosts-overview.json @@ -0,0 +1,892 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (hostname) (ceph_osd_metadata{cluster=~\"$cluster\", }))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSD Hosts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + 
"valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG CPU Busy", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg ((\n (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n 
node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n) / (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG RAM Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "IOPS Load at the device as reported by the OS on all OSD hosts", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum ((\n rate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n) + (\n rate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Physical IOPS", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg (\n label_replace(\n (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or\n (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($osd_hosts).*\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG Disk Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total send/receive network load across all hosts in the ceph cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum (\n (\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", 
\"master\", \"(.+)\")\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Network Load", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the top 10 busiest hosts by cpu", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n )\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Busy - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Top 10 hosts by network load", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10, (sum by(instance) (\n(\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) +\n(\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n 
rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "osd_hosts", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "([^.]*).*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mon_hosts", + "options": [ ], + "query": "label_values(ceph_mon_metadata{cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "mon.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mds_hosts", + "options": [ ], + "query": "label_values(ceph_mds_inodes{hostname, cluster=~\"$cluster\", })", + "refresh": 1, + "regex": "mds.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_hosts", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{hostname, cluster=~\"$cluster\", })", + "refresh": 1, + "regex": "rgw.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + 
"time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Host Overview", + "uid": "y0KGL0iZz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/multi-cluster-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/multi-cluster-overview.json new file mode 100644 index 000000000..25648cc0a --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/multi-cluster-overview.json @@ -0,0 +1,2060 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Clusters", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Healthy" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-green", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 2 + }, + "id": 3, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "count(ceph_health_status==0) or vector(0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Healthy", + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "count(ceph_health_status==1)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Warning", + "refId": "B" + }, + { + "datasource": "$datasource", + "expr": "count(ceph_health_status==2)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Error", + "refId": "C" + 
} + ], + "title": "Status", + "transparent": false, + "type": "stat" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "semi-dark-green", + "index": 2, + "text": "Healthy" + }, + "1": { + "color": "semi-dark-yellow", + "index": 0, + "text": "Warning" + }, + "2": { + "color": "semi-dark-red", + "index": 1, + "text": "Error" + } + }, + "type": "value" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Capacity Used" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cluster" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "/d/edtb0oxdq/ceph-cluster?var-cluster=${__data.fields.Cluster}&${DS_PROMETHEUS:queryparam}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Alerts" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "match": null, + "result": { + "index": 0, + "text": "0" + } + }, + "type": "special" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 19, + "x": 5, + "y": 2 + }, + "id": 4, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ceph_health_status", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ceph_mgr_metadata", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ALERTS{alertstate=\"firing\", cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ceph_cluster_by_class_total_used_bytes", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Details", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "cluster", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Value #B": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "ceph_daemon": true, + "device_class": true, + "hostname": true, + "instance 1": true, + 
"instance 2": true, + "instance 3": true, + "job 1": true, + "job 2": true, + "job 3": true, + "replica 1": true, + "replica 2": true, + "replica 3": true + }, + "indexByName": { + "Time 1": 8, + "Time 2": 13, + "Time 3": 21, + "Time 4": 7, + "Time 5": 22, + "Time 6": 23, + "Value #A": 1, + "Value #B": 20, + "Value #C": 3, + "Value #D": 6, + "__name__ 1": 9, + "__name__ 2": 14, + "__name__ 3": 24, + "ceph_daemon": 15, + "ceph_version": 2, + "cluster": 0, + "device_class": 25, + "hostname": 16, + "instance 1": 10, + "instance 2": 17, + "instance 3": 26, + "job 1": 11, + "job 2": 18, + "job 3": 27, + "replica 1": 12, + "replica 2": 19, + "replica 3": 28 + }, + "renameByName": { + "Value #A": "Status", + "Value #C": "Alerts", + "Value #D": "Capacity Used", + "ceph_version": "Version", + "cluster": "Cluster" + } + } + } + ], + "type": "table" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 5, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Overview", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 10 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_health_status{cluster=~\"$cluster\"}) or vector(0)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Cluster Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "links": [ ], + "mappings": [ ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 0.75 + }, + { + "color": "red", + "value": 0.84999999999999998 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 3, + "y": 10 + }, + "id": 7, + "interval": "1m", + "links": [ ], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_cluster_total_used_bytes{cluster=~\"$cluster\"}) / sum(ceph_cluster_total_bytes{cluster=~\"$cluster\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + } + ], + "title": "Capacity Used", + "transparent": false, + "type": "gauge" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 7, + "y": 10 + }, + "id": 8, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(ceph_cluster_total_bytes{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 10, + "y": 10 + }, + "id": 9, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_osd_metadata{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "OSDs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 13, + "y": 10 + }, + "id": 10, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(sum by (hostname) (ceph_osd_metadata{cluster=~\"$cluster\"}))", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Hosts", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 10 + }, + "id": 11, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_wr{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_rd{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Client IOPS", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 10 + }, + "id": 12, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "avg(ceph_osd_apply_latency_ms{cluster=~\"$cluster\"})", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Apply", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "avg(ceph_osd_commit_latency_ms{cluster=~\"$cluster\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Commit", + "range": true, + "refId": "B" + } + ], + "title": "OSD Latencies", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 14 + }, + "id": 13, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ALERTS{alertstate=\"firing\", cluster=~\"$cluster\"}) or vector(0)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Alert Count", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { 
+ "h": 4, + "w": 3, + "x": 7, + "y": 14 + }, + "id": 14, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(ceph_cluster_total_used_bytes{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Used", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 10, + "y": 14 + }, + "id": 15, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "predict_linear(avg(increase(ceph_cluster_total_used_bytes{cluster=~\"${Cluster}\"}[1d]))[7d:1h],120)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Capacity Prediction", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 13, + "y": 14 + }, + "id": 16, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_pool_metadata{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pools", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 14 + }, + "id": 17, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_rd_bytes{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_wr_bytes{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Client Bandwidth", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 14 + }, + "id": 18, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_osd_recovery_ops{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + } + ], + "title": "Recovery Rate", + "transparent": false, + "type": "stat" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 19, + "panels": [ + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 19 + }, + "id": 20, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\", cluster=~\"$cluster\"}) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Critical", + "range": false, + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\", cluster=~\"$cluster\"}) OR vector(0)", + 
"format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Warning", + "range": false, + "refId": "B" + } + ], + "title": "Status", + "transparent": false, + "type": "stat" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 19, + "x": 5, + "y": 19 + }, + "id": 21, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Severity" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ALERTS{alertstate=\"firing\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "cluster", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "instance": true, + "job": true, + "oid": true, + "replica": true, + "type": true + }, + "indexByName": { + "Time": 0, + "Value": 9, + "__name__": 1, + "alertname": 2, + "alertstate": 4, + "cluster": 3, + "instance": 6, + "job": 7, + "severity": 5, + "type": 8 + }, + "renameByName": { + "alertname": "Name", + "alertstate": "State", + "cluster": "Cluster", + "severity": "Severity" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 22, + "limit": 10, + "onlyAlertsOnDashboard": true, + "options": { + "alertName": "", + "dashboardAlerts": false, + "groupBy": [ ], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ ], + "title": "Alerts(Grouped)", + "type": "alertlist" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 23, + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 30 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(5, ceph_cluster_total_used_bytes/ceph_cluster_total_bytes)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 5 - Capacity Utilization(%)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 30 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(10, sum by (cluster) (irate(ceph_osd_op_w[$__interval])) \n+ sum by (cluster) (irate(ceph_osd_op_r[$__interval])) )", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 5 - Cluster IOPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 30 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } 
+ }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(10, ceph_pool_bytes_used{cluster=~\"$cluster\", }/ceph_pool_max_avail{cluster=~\"$cluster\", } * on(pool_id, cluster) group_left(instance, name) ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}} - {{name}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 10 - Capacity Utilization(%) by Pool", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cluster Stats", + "titleSize": "h6", + "type": "row" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph - Multi-cluster", + "uid": "BnxelG7Sx", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json new file mode 100644 index 000000000..60f1ecc5a --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -0,0 +1,914 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + 
"fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": 
"short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Read Bytes", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Bytes", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Bytes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 6, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Physical Device Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(\n label_replace(\n rate(node_disk_read_time_seconds_total[$__rate_interval]) /\n rate(node_disk_reads_completed_total[$__rate_interval]),\n \"instance\", \"$1\", 
\"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Reads", + "refId": "A" + }, + { + "expr": "(\n label_replace(\n rate(node_disk_write_time_seconds_total[$__rate_interval]) /\n rate(node_disk_writes_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device Latency for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_writes_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Writes", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(node_disk_reads_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Reads", + "refId": "B" + } + ], + "thresholds": [ ], + 
"timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W IOPS for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 11 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_read_bytes_total[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Reads", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(node_disk_written_bytes_total[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W Bytes for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + 
"rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_io_time_seconds_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device Util% for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "OSD", + "multi": false, + "name": "osd", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD device details", + "uid": "CrAHE0iZz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osds-overview.json new file mode 100644 index 000000000..948f0d721 --- /dev/null +++ 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -0,0 +1,1339 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "grafana-piechart-panel", + "name": "Pie Chart", + "type": "panel", + "version": "1.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { + "@95%ile": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG read", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX read", + "refId": "B" + }, + { + "expr": "quantile(0.95,\n (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n * 1000\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "OSD Read Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": 
false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency (ms)" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sort(\n (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n )\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest READ Latencies", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { + "@95%ile write": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG write", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX write", + "refId": "B" + }, + { + "expr": "quantile(0.95, (\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n))\n", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile write", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "OSD Write Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "This table shows the osd's that are delivering the 10 highest write latencies within the cluster", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency (ms)" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "NaN": { + "index": 0, + "text": "0.00" + } + }, + "type": "value" + } + ] + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 5, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sort(\n (rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000)\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest WRITE Latencies", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": 
"table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "count by (device_class) (ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device_class}}", + "refId": "A" + } + ], + "title": "OSD Types Summary", + "type": "piechart" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 4, + "y": 8 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "count(ceph_bluefs_wal_total_bytes{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "bluestore", + "refId": "A" + }, + { + "expr": "absent(ceph_bluefs_wal_total_bytes{cluster=~\"$cluster\", }) * count(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "filestore", + "refId": "B" + } + ], + "title": "OSD Objectstore Types", + "type": "piechart" + }, + { + "datasource": "$datasource", + "description": "The pie chart shows the various OSD sizes used within the cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 8 + }, + "id": 8, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } < 1099511627776)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<1TB", + "refId": "A" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 1099511627776 < 2199023255552)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<2TB", + "refId": "B" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 2199023255552 < 3298534883328)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<3TB", + "refId": "C" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 3298534883328 < 4398046511104)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<4TB", + "refId": "D" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 4398046511104 < 6597069766656)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<6TB", + "refId": "E" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 6597069766656 < 8796093022208)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<8TB", + "refId": "F" + }, + { + "expr": 
"count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 8796093022208 < 10995116277760)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<10TB", + "refId": "G" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 10995116277760 < 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB", + "refId": "H" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB+", + "refId": "I" + } + ], + "title": "OSD Size Summary", + "type": "piechart" + }, + { + "aliasColors": { }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 8 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_osd_numpg{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "PGs per OSD", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Distribution of PGs per OSD", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": 20, + "mode": "histogram", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "# of OSDs", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 8 + }, + "id": 10, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_bluestore_onode_hits{cluster=~\"$cluster\", }) / (\n sum(ceph_bluestore_onode_hits{cluster=~\"$cluster\", }) +\n sum(ceph_bluestore_onode_misses{cluster=~\"$cluster\", 
})\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".75", + "title": "OSD onode Hits Ratio", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 11, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "R/W Profile", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the read/write workload profile overtime", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Read/Write Profile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "This table shows the 10 OSDs with the highest number of slow ops", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Slow Ops" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 25 + }, + "id": 13, + "links": [ 
], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "cluster": true, + "instance": true, + "job": true, + "type": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD Overview", + "uid": "lo02I1Aiz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-detail.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-detail.json new file mode 100644 index 000000000..5e5bf6e9b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-detail.json @@ -0,0 +1,724 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "percentunit", + "gauge": { + 
"maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_stored{cluster=~\"$cluster\", } / (ceph_pool_stored{cluster=~\"$cluster\", } + ceph_pool_max_avail{cluster=~\"$cluster\", })) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".7,.8", + "title": "Capacity used", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": 100, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Time till pool is full assuming the average fill rate of the last 6 hours", + "format": "s", + "gauge": { + "maxValue": false, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 7, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": "" + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_max_avail{cluster=~\"$cluster\", } / deriv(ceph_pool_stored{cluster=~\"$cluster\", }[6h])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", } > 0\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "current", + "title": "Time till full", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": false + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 
null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "deriv(ceph_pool_objects{cluster=~\"$cluster\", }[1m]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Objects per second", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Object Ingress/Egress", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Objects out(-) / in(+) ", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "iops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Client IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + 
"custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Client Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_objects{cluster=~\"$cluster\", } *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of Objects", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Objects", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", 
+ "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Pool Name", + "multi": false, + "name": "pool_name", + "options": [ ], + "query": "label_values(ceph_pool_metadata{cluster=~\"$cluster\", }, name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pool Details", + "uid": "-xyV8KCiz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-overview.json new file mode 100644 index 000000000..fa32b3368 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/pool-overview.json @@ -0,0 +1,1691 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 
0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Count of the pools that have compression enabled", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{compression_mode!=\"none\", cluster=~\"$cluster\", })", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools with Compression", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total raw capacity available to the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_osd_stat_bytes{cluster=~\"$cluster\", })", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Total Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + 
"description": "Total raw capacity consumed by user data and associated overheads (metadata + redundancy)", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_bytes_used{cluster=~\"$cluster\", })", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity Consumed", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total of client data stored in the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_stored{cluster=~\"$cluster\", })", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Logical Stored ", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + 
"postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } -\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", }\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Savings", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(\n sum(ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n sum(ceph_pool_stored_raw{cluster=~\"$cluster\", } and ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0)\n) * 100\n", + "format": "table", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Eligibility", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 9, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0)\n / sum(ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Factor", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool Name" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pool_id" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool ID" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Factor" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #D" + }, + "properties": [ + { + "id": "displayName", + "value": "% Used" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } 
+ } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #B" + }, + "properties": [ + { + "id": "displayName", + "value": "Usable Free" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #C" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Eligibility" + }, + { + "id": "unit", + "value": "percent" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #E" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Savings" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #F" + }, + "properties": [ + { + "id": "displayName", + "value": "Growth (5d)" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #G" + }, + "properties": [ + { + "id": "displayName", + "value": "IOPS" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #H" + }, + "properties": [ + { + "id": "displayName", + "value": "Bandwidth" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "compression_mode" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "description" + }, + "properties": [ + { + "id": "displayName", + "value": "Type" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #J" + }, + "properties": [ + { + "id": "displayName", + "value": "Stored" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #I" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #K" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 10, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": 
"(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } /\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n) and on(pool_id) (\n (\n (ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n ceph_pool_stored_raw{cluster=~\"$cluster\", }\n ) * 100 > 0.5\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "A", + "refId": "A" + }, + { + "expr": "ceph_pool_max_avail{cluster=~\"$cluster\", } *\n on(pool_id) group_left(name) ceph_pool_metadata{cluster=~\"$cluster\", }\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "B", + "refId": "B" + }, + { + "expr": "(\n (ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n ceph_pool_stored_raw{cluster=~\"$cluster\", }\n) * 100\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "C", + "refId": "C" + }, + { + "expr": "ceph_pool_percent_used{cluster=~\"$cluster\", } *\n on(pool_id) group_left(name) ceph_pool_metadata{cluster=~\"$cluster\", }\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "D", + "refId": "D" + }, + { + "expr": "ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } -\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "E", + "refId": "E" + }, + { + "expr": "delta(ceph_pool_stored{cluster=~\"$cluster\", }[5d])", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "F", + "refId": "F" + }, + { + "expr": "rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval])\n + rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "G", + "refId": "G" + }, + { + "expr": "rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "H", + "refId": "H" + }, + { + "expr": "ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "I", + "refId": "I" + }, + { + "expr": "ceph_pool_stored{cluster=~\"$cluster\", } * on(pool_id) group_left ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "J", + "refId": "J" + }, + { + "expr": "ceph_pool_metadata{compression_mode!=\"none\", cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "K", + "refId": "K" + }, + { + "expr": "", + "format": "", + "intervalFactor": "", + "legendFormat": "L", + "refId": "L" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Overview", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "seriesToRows", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value #A": true, + "Value #B": false, + "Value #C": true, + "Value #D": false, + "Value #E": true, + "Value #I": true, + "Value #K": true, + "__name__": true, + "cluster": true, + "compression_mode": true, + "instance": true, + "job": true, + "pool_id": true, + "type": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + 
"description": "This chart shows the sum of read and write IOPS from all clients by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n round(\n (\n rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])\n ), 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{cluster=~\"$cluster\", })\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} ", + "refId": "A" + }, + { + "expr": "topk($topk,\n rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} - write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client IOPS by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "IOPS", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "The chart shows the sum of read and write bytes from all clients, by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n (\n rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client Bandwidth by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { 
+ "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Throughput", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_bytes_used{cluster=~\"$cluster\", } * on(pool_id) group_right ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Capacity Usage (RAW)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Capacity Used", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "15", + "value": "15" + }, + "hide": 0, + "includeAll": false, + "label": "TopK", + "multi": false, + "name": "topk", + "options": [ + { + "text": "15", + "value": "15" + } + ], + "query": "15", + "refresh": 0, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pools Overview", + "uid": "z99hzWtmk", + "version": 0 +} diff --git 
a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json new file mode 100644 index 000000000..35de6b09b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -0,0 +1,651 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "grafana-piechart-panel", + "name": "Pie Chart", + "type": "panel", + "version": "1.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Host Detail : $rgw_servers", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$rgw_servers GET/PUT Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by HTTP Operation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + "GETs": "#7eb26d", + "Other": "#447ebc", + "PUTs": "#eab839", + "Requests": "#3f2b5b", + "Requests Failed": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 13, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", 
cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests Failed {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) -\n (\n rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other {{ceph_daemon}}", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Breakdown", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#bf1b00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "GETs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7eb26d", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Other (HEAD,POST,DELETE)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#447ebc", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PUTs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#eab839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#3f2b5b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "options": { + "displayLabels": [ ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failures {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) -\n (\n rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", + "refId": "D" + } + ], + "title": "Workload Breakdown", + "type": "piechart" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Instance Detail", + "uid": "x5ARzZtmk", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json new file mode 100644 index 000000000..5e185b63b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -0,0 +1,1336 @@ +{ + "__inputs": [ ], + 
"__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Overview - All Gateways", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{rgw_host}}", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{rgw_host}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average GET/PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 7, + 
"x": 8, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total Requests/sec by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GET Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + 
"description": "Total bytes transferred in/out of all radosgw instances within the cluster", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth Consumed by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 7, + "x": 8, + "y": 8 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, 
+ "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 15, + "y": 8 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 12, + "w": 9, + "x": 0, + "y": 12 + }, + "id": 9, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Overview - HAProxy Metrics", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 0, + "y": 12 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Back.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*1.*/" + }, + { + "alias": "/.*2.*/" + }, + { + "alias": "/.*3.*/" + }, + { + "alias": "/.*4.*/" + }, + { + "alias": "/.*5.*/" + }, + { + "alias": "/.*other.*/" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(\n rate(\n haproxy_frontend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"frontend\"}[$__rate_interval]\n )\n) by (code)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Frontend {{ code }}", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"backend\"}[$__rate_interval]\n )\n) by (code)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Backend {{ code }}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total responses by HTTP code", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 5, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Response.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Backend.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_http_requests_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_response_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Response errors", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_request_errors_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests errors", + "refId": "C" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_redispatch_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Backend redispatch", + "refId": "D" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_retry_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "Backend retry", + "refId": "E" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_requests_denied_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request denied", + "refId": "F" + }, + { + "expr": "sum(\n haproxy_backend_current_queue{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Backend Queued", + "refId": "G" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total requests / responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 10, + "y": 12 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Back.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_connections_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Front", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_connection_attempts_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Back", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_connection_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Back errors", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total number of connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 6, + "x": 15, + "y": 12 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*OUT.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_bytes_in_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "IN Front", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_bytes_out_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OUT Front", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_bytes_in_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "IN Back", + "refId": "C" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_bytes_out_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OUT Back", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Current total of incoming / outgoing bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + 
"query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": ".*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "HTTP Code", + "multi": false, + "name": "code", + "options": [ ], + "query": "label_values(haproxy_server_http_responses_total{job=~\"$job_haproxy\", instance=~\"$ingress_service\"}, code)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job haproxy", + "multi": true, + "name": "job_haproxy", + "options": [ ], + "query": "label_values(haproxy_server_status, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Ingress Service", + "multi": false, + "name": "ingress_service", + "options": [ ], + "query": "label_values(haproxy_server_status{job=~\"$job_haproxy\"}, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Overview", + "uid": "WAkugZpiz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json new file mode 100644 index 000000000..a7550d27c --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json @@ -0,0 +1,614 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + 
"max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (throughput) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (objects) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Polling Request Latency from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Unsuccessful Object Replications from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Count/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 
16, + "x": 8, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_rgw_sync_delta_sync_delta[$__rate_interval])", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance_id}} - {{shard_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Replication(Time) Delta per shard", + "type": "timeseries" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "rgw.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Sync Overview", + "uid": "rgw-sync-overview", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-details.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-details.json new file mode 100644 index 000000000..500c51f4b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-details.json @@ -0,0 +1,465 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Detailed Performance of RBD Images (IOPS/Throughput/Latency)", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "iops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_ops{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_ops{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_bytes{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_bytes{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, 
+ "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ns" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_latency_sum{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rbd_write_latency_count{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_latency_sum{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rbd_read_latency_count{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + 
"datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "pool", + "options": [ ], + "query": "label_values(ceph_rbd_read_ops{cluster=~\"$cluster\", }, pool)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "image", + "options": [ ], + "query": "label_values(ceph_rbd_read_ops{cluster=~\"$cluster\", , pool=\"$pool\"}, image)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RBD Details", + "uid": "YhCYGcuZz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json new file mode 100644 index 000000000..34666c67b --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -0,0 +1,885 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.4.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_ops{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_ops{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_bytes{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_bytes{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ns" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": 
false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(\n sum(rate(ceph_rbd_write_latency_sum{cluster=~\"$cluster\", }[$__rate_interval])) /\n sum(rate(ceph_rbd_write_latency_count{cluster=~\"$cluster\", }[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(\n sum(rate(ceph_rbd_read_latency_sum{cluster=~\"$cluster\", }[$__rate_interval])) /\n sum(rate(ceph_rbd_read_latency_count{cluster=~\"$cluster\", }[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "IOPS" + }, + { + "id": "unit", + "value": "iops" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (\n sort((\n rate(ceph_rbd_write_ops{cluster=~\"$cluster\", }[$__rate_interval]) +\n on (image, pool, namespace) rate(ceph_rbd_read_ops{cluster=~\"$cluster\", }[$__rate_interval])\n ))\n )\n)\n", + "format": "table", + "instant": true, + 
"intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest IOPS", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Throughput" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 6, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n sort(\n sum(\n rate(ceph_rbd_read_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rbd_write_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n ) by (pool, image, namespace)\n )\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Throughput", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + 
"value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency" + }, + { + "id": "unit", + "value": "ns" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 7, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n sum(\n rate(ceph_rbd_write_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_write_latency_count{cluster=~\"$cluster\", }[$__rate_interval]), 1) +\n rate(ceph_rbd_read_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_read_latency_count{cluster=~\"$cluster\", }[$__rate_interval]), 1)\n ) by (pool, image, namespace)\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Latency", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RBD Overview", + "uid": "41FrpeUiz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rgw-s3-analytics.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rgw-s3-analytics.json new file mode 100644 index 000000000..397279f54 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/dashboards_out/rgw-s3-analytics.json @@ -0,0 +1,4715 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + 
"graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Overview", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total PUTs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 4, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum\n(ceph_rgw_op_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total GETs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + 
"refId": "A" + } + ], + "title": "Total Objects", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum\n((sum by(instance_id)(ceph_rgw_op_put_obj_bytes) > 0) / (sum by(instance_id)(ceph_rgw_op_put_obj_ops) > 0) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average Object Size", + "transparent": false, + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 7, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "List Objects", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_buckets_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "List Buckets", + "range": true, + "refId": "B" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Objects", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "D" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "E" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_bucket_ops *\n on (instance_id) 
group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Buckets", + "range": false, + "refId": "F" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": true, + "refId": "G" + } + ], + "title": "Total Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 4 + }, + "id": 8, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Objects", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": true, + "refId": "D" + } + ], + "title": "Total Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 4 + }, + "id": 9, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + 
"intervalFactor": 1, + "legendFormat": "List Object", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_buckets_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "List Bucket", + "range": true, + "refId": "B" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Object", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Object", + "range": false, + "refId": "D" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Object", + "range": false, + "refId": "E" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_bucket_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Bucket", + "range": false, + "refId": "F" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Object", + "range": true, + "refId": "G" + } + ], + "title": "Total Latencies", + "type": "bargauge" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 10, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "A" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per Bucket by Bandwidth", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": false, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #D": 4, + "Value #F": 5, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon", + "ceph_daemon 1": "Daemon" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + 
"cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 11, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "List Objects", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency(ms) Per Bucket", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "Bucket", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + 
"aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "ceph_daemon 5": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 12, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per User By Bandwidth", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + }, + "instance": { + "aggregations": [ ] + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #D": 4, + "Value #F": 5, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 13, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": 
"9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency(ms) Per User", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "User", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "ceph_daemon 5": true, + "instance 1": true, + "instance 2": true, + 
"instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 14, + "panels": [ + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 29 + }, + "id": 15, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Bucket PUTs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 29 + }, + "id": 16, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Bucket GETs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 29 + }, + "id": 17, + 
"options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Buckets PUTs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 29 + }, + "id": 18, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Buckets GETs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 37 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket PUTs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 37 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket GETs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 37 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Copy by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 37 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Delete by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 45 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + 
"step": 300 + } + ], + "title": "Bucket GETs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 45 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket PUTs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 45 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", 
+ "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket List by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 45 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Delete by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_ops) *\n on 
(instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Copy by Operations", + "type": "timeseries" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 28, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": 
null, + "timeShift": null, + "title": "Summary Per Bucket by Operations", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "Bucket", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon" + } + } + } + ], + "type": "table" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Buckets", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 29, + "panels": [ + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 62 + }, + "id": 30, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 
5 Users PUTs By Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 62 + }, + "id": 31, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users GETs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 62 + }, + "id": 32, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users PUTs by Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 62 + }, + "id": 33, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users GETs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 70 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User PUTs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 70 + }, + "id": 35, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User GETs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 70 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User Delete by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 70 + }, + "id": 37, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User COPY by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 78 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User GETs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 78 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User PUTs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 78 + }, + "id": 40, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User List by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 78 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User Delete by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User Copy by Operations", + "type": "timeseries" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 43, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per User By Operations", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "User", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "LIST", + "Value #D": "DELETE", + "Value #F": "COPY", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Users", + "titleSize": "h6", + "type": "row" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": 
false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "datasource": "$datasource", + "hide": 2, + "label": "filters", + "name": "Filters", + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW S3 Analytics", + "uid": "BnxelG7Sz", + "version": 0 +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnet-bundler-build.sh b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnet-bundler-build.sh new file mode 100755 index 000000000..d713cffb8 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnet-bundler-build.sh @@ -0,0 +1,8 @@ +#!/bin/sh -ex + +JSONNET_VERSION="v0.4.0" +OUTPUT_DIR=${1:-$(pwd)} + +git clone -b ${JSONNET_VERSION} --depth 1 https://github.com/jsonnet-bundler/jsonnet-bundler +make -C jsonnet-bundler build +mv jsonnet-bundler/_output/jb ${OUTPUT_DIR} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.json new file mode 100644 index 000000000..93f3316ec --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.lock.json b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.lock.json new file mode 100644 index 000000000..480438230 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/jsonnetfile.lock.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "a1d61cce1da59c71409b99b5c7568511fec661ea", + "sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc=" + } + ], + "legacyImports": false +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/lint-jsonnet.sh 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/lint-jsonnet.sh new file mode 100755 index 000000000..764245684 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/lint-jsonnet.sh @@ -0,0 +1,14 @@ +#!/bin/sh -e + +JSONNETS_FILES=$(find . -name 'vendor' -prune -o \ + -name '*.jsonnet' -print -o -name '*.libsonnet' -print) +for each_jsonnet_file in ${JSONNETS_FILES}; do + jsonnetfmt "$@" ${each_jsonnet_file} || jfmt_failed_files="$jfmt_failed_files ${each_jsonnet_file}" +done +exit_status=0 +# if variable 'jfmt_failed_files' is not empty/null +if [ -n "${jfmt_failed_files}" ]; then + echo "'jsonnetfmt' check failed on:${jfmt_failed_files}" + exit_status=1 +fi +exit $exit_status diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet new file mode 100644 index 000000000..3c983a300 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'config.libsonnet') + +(import 'dashboards.libsonnet') + +(import 'alerts.libsonnet') diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.libsonnet new file mode 100644 index 000000000..fa2899b22 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -0,0 +1,995 @@ +{ + _config:: error 'must provide _config', + + MultiClusterQuery():: + if $._config.showMultiCluster + then 'cluster,' + else '', + + MultiClusterSummary():: + if $._config.showMultiCluster + then ' on cluster {{ $labels.cluster }}' + else '', + + groups+: [ + { + name: 'cluster health', + rules: [ + { + alert: 'CephHealthError', + 'for': '5m', + expr: 'ceph_health_status == 2', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.2.1' }, + annotations: { + summary: 'Ceph is in the ERROR state%(cluster)s' % $.MultiClusterSummary(), + description: "The cluster state has been HEALTH_ERROR for more than 5 minutes%(cluster)s. Please check 'ceph health detail' for more information." % $.MultiClusterSummary(), + }, + }, + { + alert: 'CephHealthWarning', + 'for': '15m', + expr: 'ceph_health_status == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Ceph is in the WARNING state%(cluster)s' % $.MultiClusterSummary(), + description: "The cluster state has been HEALTH_WARN for more than 15 minutes%(cluster)s. Please check 'ceph health detail' for more information." 
% $.MultiClusterSummary(), + }, + }, + ], + }, + { + name: 'mon', + rules: [ + { + alert: 'CephMonDownQuorumAtRisk', + 'for': '30s', + expr: ||| + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( + count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + ) + ) == 1 + |||, + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.3.1' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down', + summary: 'Monitor quorum is at risk%(cluster)s' % $.MultiClusterSummary(), + description: '{{ $min := query "floor(count(ceph_mon_metadata) / 2) + 1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', + }, + }, + { + alert: 'CephMonDown', + 'for': '30s', + expr: ||| + count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + |||, + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down', + summary: 'One or more monitors down%(cluster)s' % $.MultiClusterSummary(), + description: ||| + {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + |||, + }, + }, + { + alert: 'CephMonDiskspaceCritical', + 'for': '1m', + expr: 'ceph_health_detail{name="MON_DISK_CRIT"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.3.2' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit', + summary: 'Filesystem space on at least one monitor is critically low%(cluster)s' % $.MultiClusterSummary(), + description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. 
Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}", + }, + }, + { + alert: 'CephMonDiskspaceLow', + 'for': '5m', + expr: 'ceph_health_detail{name="MON_DISK_LOW"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low', + summary: 'Drive space on at least one monitor is approaching full%(cluster)s' % $.MultiClusterSummary(), + description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}", + }, + }, + { + alert: 'CephMonClockSkew', + 'for': '1m', + expr: 'ceph_health_detail{name="MON_CLOCK_SKEW"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew', + summary: 'Clock skew detected among monitors%(cluster)s' % $.MultiClusterSummary(), + description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon.", + }, + }, + ], + }, + { + name: 'osd', + rules: [ + { + alert: 'CephOSDDownHigh', + expr: 'count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.1' }, + annotations: { + summary: 'More than 10%% of OSDs are down%(cluster)s' % $.MultiClusterSummary(), + description: '{{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). 
The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', + }, + }, + { + alert: 'CephOSDHostDown', + 'for': '5m', + expr: 'ceph_health_detail{name="OSD_HOST_DOWN"} == 1', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.8' }, + annotations: { + summary: 'An OSD host is offline%(cluster)s' % $.MultiClusterSummary(), + description: 'The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}', + }, + }, + { + alert: 'CephOSDDown', + 'for': '5m', + expr: 'ceph_health_detail{name="OSD_DOWN"} == 1', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.2' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down', + summary: 'An OSD has been marked down%(cluster)s' % $.MultiClusterSummary(), + description: ||| + {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + |||, + }, + }, + { + alert: 'CephOSDNearFull', + 'for': '5m', + expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.3' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull', + summary: 'OSD(s) running low on free space (NEARFULL)%(cluster)s' % $.MultiClusterSummary(), + description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.", + }, + }, + { + alert: 'CephOSDFull', + 'for': '1m', + expr: 'ceph_health_detail{name="OSD_FULL"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.6' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full', + summary: 'OSD full, writes blocked%(cluster)s' % $.MultiClusterSummary(), + description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.", + }, + }, + { + alert: 'CephOSDBackfillFull', + 'for': '1m', + expr: 'ceph_health_detail{name="OSD_BACKFILLFULL"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull', + summary: 'OSD(s) too full for backfill operations%(cluster)s' % $.MultiClusterSummary(), + description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. 
To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.", + }, + }, + { + alert: 'CephOSDTooManyRepairs', + 'for': '30s', + expr: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs', + summary: 'OSD reports a high number of read errors%(cluster)s' % $.MultiClusterSummary(), + description: 'Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.', + }, + }, + { + alert: 'CephOSDTimeoutsPublicNetwork', + 'for': '1m', + expr: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Network issues delaying OSD heartbeats (public network)%(cluster)s' % $.MultiClusterSummary(), + description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs.", + }, + }, + { + alert: 'CephOSDTimeoutsClusterNetwork', + 'for': '1m', + expr: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Network issues delaying OSD heartbeats (cluster network)%(cluster)s' % $.MultiClusterSummary(), + description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.", + }, + }, + { + alert: 'CephOSDInternalDiskSizeMismatch', + 'for': '1m', + expr: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch', + summary: 'OSD size inconsistency error%(cluster)s' % $.MultiClusterSummary(), + description: 'One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs.', + }, + }, + { + alert: 'CephDeviceFailurePredicted', + 'for': '1m', + expr: 'ceph_health_detail{name="DEVICE_HEALTH"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#id2', + summary: 'Device(s) predicted to fail soon%(cluster)s' % $.MultiClusterSummary(), + description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info '. Mark the OSD out so that data may migrate to other OSDs. 
Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD.", + }, + }, + { + alert: 'CephDeviceFailurePredictionTooHigh', + 'for': '1m', + expr: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.7' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany', + summary: 'Too many devices are predicted to fail, unable to resolve%(cluster)s' % $.MultiClusterSummary(), + description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated.', + }, + }, + { + alert: 'CephDeviceFailureRelocationIncomplete', + 'for': '1m', + expr: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use', + summary: 'Device failure is predicted, but unable to relocate data%(cluster)s' % $.MultiClusterSummary(), + description: 'The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer.', + }, + }, + { + alert: 'CephOSDFlapping', + expr: '(rate(ceph_osd_up[5m]) * on(%(cluster)sceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1' % $.MultiClusterQuery(), + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.4' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds', + summary: 'Network issues are causing OSDs to flap (mark each other down)%(cluster)s' % $.MultiClusterSummary(), + description: 'OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s).', + }, + }, + { + alert: 'CephOSDReadErrors', + 'for': '30s', + expr: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors', + summary: 'Device read errors detected%(cluster)s' % $.MultiClusterSummary(), + description: 'An OSD has encountered read errors, but the OSD has recovered by retrying the reads. 
This may indicate an issue with hardware or the kernel.', + }, + }, + { + alert: 'CephPGImbalance', + 'for': '5m', + expr: ||| + abs( + ((ceph_osd_numpg > 0) - on (%(cluster)sjob) group_left avg(ceph_osd_numpg > 0) by (%(cluster)sjob)) / + on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on (%(cluster)sceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + ||| % [$.MultiClusterQuery(), $.MultiClusterQuery(), $.MultiClusterQuery()], + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.5' }, + annotations: { + summary: 'PGs are not balanced across OSDs%(cluster)s' % $.MultiClusterSummary(), + description: 'OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count.', + }, + }, + ], + }, + { + name: 'mds', + rules: [ + { + alert: 'CephFilesystemDamaged', + 'for': '1m', + expr: 'ceph_health_detail{name="MDS_DAMAGE"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.1' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages', + summary: 'CephFS filesystem is damaged%(cluster)s.' % $.MultiClusterSummary(), + description: 'Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support.', + }, + }, + { + alert: 'CephFilesystemOffline', + 'for': '1m', + expr: 'ceph_health_detail{name="MDS_ALL_DOWN"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.3' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down', + summary: 'CephFS filesystem is offline%(cluster)s' % $.MultiClusterSummary(), + description: 'All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline.', + }, + }, + { + alert: 'CephFilesystemDegraded', + 'for': '1m', + expr: 'ceph_health_detail{name="FS_DEGRADED"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.4' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded', + summary: 'CephFS filesystem is degraded%(cluster)s' % $.MultiClusterSummary(), + description: 'One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable.', + }, + }, + { + alert: 'CephFilesystemMDSRanksLow', + 'for': '1m', + expr: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max', + summary: 'Ceph MDS daemon count is lower than configured%(cluster)s' % $.MultiClusterSummary(), + description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. 
The current number of active MDS daemons is less than this value.", + }, + }, + { + alert: 'CephFilesystemInsufficientStandby', + 'for': '1m', + expr: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby', + summary: 'Ceph filesystem standby daemons too few%(cluster)s' % $.MultiClusterSummary(), + description: 'The current number of standby daemons is less than the minimum required by standby_count_wanted. Adjust the standby count or increase the number of MDS daemons.', + }, + }, + { + alert: 'CephFilesystemFailureNoStandby', + 'for': '1m', + expr: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.5' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds', + summary: 'MDS daemon failed, no further standby available%(cluster)s' % $.MultiClusterSummary(), + description: 'An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS.', + }, + }, + { + alert: 'CephFilesystemReadOnly', + 'for': '1m', + expr: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.2' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages', + summary: 'CephFS filesystem in read only mode due to write error(s)%(cluster)s' % $.MultiClusterSummary(), + description: 'The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support.', + }, + }, + ], + }, + { + name: 'mgr', + rules: [ + { + alert: 'CephMgrModuleCrash', + 'for': '5m', + expr: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.1' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash', + summary: 'A manager module has recently crashed%(cluster)s' % $.MultiClusterSummary(), + description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure.", + }, + }, + { + alert: 'CephMgrPrometheusModuleInactive', + 'for': '1m', + expr: 'up{job="ceph"} == 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.2' }, + annotations: { + summary: 'The mgr/prometheus module is not available%(cluster)s' % $.MultiClusterSummary(), + description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it; otherwise, you can determine module status with 'ceph mgr module ls'.
If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.", + }, + }, + ], + }, + { + name: 'pgs', + rules: [ + { + alert: 'CephPGsInactive', + 'for': '5m', + expr: 'ceph_pool_metadata * on(%(cluster)spool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0' % $.MultiClusterQuery(), + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.1' }, + annotations: { + summary: 'One or more placement groups are inactive%(cluster)s' % $.MultiClusterSummary(), + description: '{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests.', + }, + }, + { + alert: 'CephPGsUnclean', + 'for': '15m', + expr: 'ceph_pool_metadata * on(%(cluster)spool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0' % $.MultiClusterQuery(), + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.2' }, + annotations: { + summary: 'One or more placement groups are marked unclean%(cluster)s' % $.MultiClusterSummary(), + description: '{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.', + }, + }, + { + alert: 'CephPGsDamaged', + 'for': '5m', + expr: 'ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.4' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged', + summary: 'Placement group damaged, manual intervention needed%(cluster)s' % $.MultiClusterSummary(), + description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use the 'ceph pg repair ' command.", + }, + }, + { + alert: 'CephPGRecoveryAtRisk', + 'for': '1m', + expr: 'ceph_health_detail{name="PG_RECOVERY_FULL"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.5' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full', + summary: 'OSDs are too full for recovery%(cluster)s' % $.MultiClusterSummary(), + description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.", + }, + }, + { + alert: 'CephPGUnavailableBlockingIO', + 'for': '1m', + expr: '((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.3' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability', + summary: 'PG is unavailable%(cluster)s, blocking I/O' % $.MultiClusterSummary(), + description: "Data availability is reduced, impacting the cluster's ability to service I/O. 
One or more placement groups (PGs) are in a state that blocks I/O.", + }, + }, + { + alert: 'CephPGBackfillAtRisk', + 'for': '1m', + expr: 'ceph_health_detail{name="PG_BACKFILL_FULL"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.6' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full', + summary: 'Backfill operations are blocked due to lack of free space%(cluster)s' % $.MultiClusterSummary(), + description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.", + }, + }, + { + alert: 'CephPGNotScrubbed', + 'for': '5m', + expr: 'ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed', + summary: 'Placement group(s) have not been scrubbed%(cluster)s' % $.MultiClusterSummary(), + description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub ", + }, + }, + { + alert: 'CephPGsHighPerOSD', + 'for': '1m', + expr: 'ceph_health_detail{name="TOO_MANY_PGS"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs', + summary: 'Placement groups per OSD is too high%(cluster)s' % $.MultiClusterSummary(), + description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools.", + }, + }, + { + alert: 'CephPGNotDeepScrubbed', + 'for': '5m', + expr: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed', + summary: 'Placement group(s) have not been deep scrubbed%(cluster)s' % $.MultiClusterSummary(), + description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. 
When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.", + }, + }, + ], + }, + { + name: 'nodes', + rules: [ + { + alert: 'CephNodeRootFilesystemFull', + 'for': '5m', + expr: 'node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.1' }, + annotations: { + summary: 'Root filesystem is dangerously full%(cluster)s' % $.MultiClusterSummary(), + description: 'Root volume is dangerously full: {{ $value | humanize }}% free.', + }, + }, + { + alert: 'CephNodeNetworkPacketDrops', + expr: ||| + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= %(CephNodeNetworkPacketDropsThreshold)s and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= %(CephNodeNetworkPacketDropsPerSec)s + ||| % $._config, + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' }, + annotations: { + summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(), + description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec }, + }, + }, + { + alert: 'CephNodeNetworkPacketErrors', + expr: ||| + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + |||, + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' }, + annotations: { + summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(), + description: 'Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}.', + }, + }, + { + alert: 'CephNodeNetworkBondDegraded', + expr: ||| + node_bonding_slaves - node_bonding_active != 0 + |||, + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Degraded Bond on Node {{ $labels.instance }}%(cluster)s' % $.MultiClusterSummary(), + description: 'Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}.', + }, + }, + { + alert: 'CephNodeDiskspaceWarning', + expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' }, + annotations: { + summary: 'Host filesystem free space is getting low%(cluster)s' % $.MultiClusterSummary(), + description: 'Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate.', + }, + }, + { + alert: 
'CephNodeInconsistentMTU', + expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'MTU settings across Ceph hosts are inconsistent%(cluster)s' % $.MultiClusterSummary(), + description: 'Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}.', + }, + }, + ], + }, + { + name: 'pools', + rules: [ + { + alert: 'CephPoolGrowthWarning', + expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id, instance) group_right() ceph_pool_metadata) >= 95' % $.MultiClusterQuery(), + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' }, + annotations: { + summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(), + description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.", + }, + }, + { + alert: 'CephPoolBackfillFull', + expr: 'ceph_health_detail{name="POOL_BACKFILLFULL"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Free space in a pool is too low for recovery/backfill%(cluster)s' % $.MultiClusterSummary(), + description: 'A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.', + }, + }, + { + alert: 'CephPoolFull', + 'for': '1m', + expr: 'ceph_health_detail{name="POOL_FULL"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.1' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full', + summary: 'Pool is full - writes are blocked%(cluster)s' % $.MultiClusterSummary(), + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes )", + }, + }, + { + alert: 'CephPoolNearFull', + 'for': '5m', + expr: 'ceph_health_detail{name="POOL_NEAR_FULL"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'One or more Ceph pools are nearly full%(cluster)s' % $.MultiClusterSummary(), + description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes ). 
Also ensure that the balancer is active.", + }, + }, + ], + }, + { + name: 'healthchecks', + rules: [ + { + alert: 'CephSlowOps', + 'for': '30s', + expr: 'ceph_healthcheck_slow_ops > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops', + summary: 'OSD operations are slow to complete%(cluster)s' % $.MultiClusterSummary(), + description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)', + }, + }, + { + alert: 'CephDaemonSlowOps', + 'for': '30s', + expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops', + summary: '{{ $labels.ceph_daemon }} operations are slow to complete', + description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)', + }, + }, + ], + }, + { + name: 'cephadm', + rules: [ + { + alert: 'CephadmUpgradeFailed', + 'for': '30s', + expr: 'ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.11.2' }, + annotations: { + summary: 'Ceph version upgrade has failed%(cluster)s' % $.MultiClusterSummary(), + description: 'The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue', + }, + }, + { + alert: 'CephadmDaemonFailed', + 'for': '30s', + expr: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.11.1' }, + annotations: { + summary: 'A ceph daemon managed by cephadm is down%(cluster)s' % $.MultiClusterSummary(), + description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start '", + }, + }, + { + alert: 'CephadmPaused', + 'for': '1m', + expr: 'ceph_health_detail{name="CEPHADM_PAUSED"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused', + summary: 'Orchestration tasks via cephadm are PAUSED%(cluster)s' % $.MultiClusterSummary(), + description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'", + }, + }, + ], + }, + { + name: 'hardware', + rules: [ + { + alert: 'HardwareStorageError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' }, + annotations: { + summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Some storage devices are in error. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareMemoryError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' }, + annotations: { + summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'DIMM error(s) detected. 
Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareProcessorError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' }, + annotations: { + summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Processor error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareNetworkError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' }, + annotations: { + summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Network error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwarePowerError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' }, + annotations: { + summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Power supply error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareFanError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' }, + annotations: { + summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Fan error(s) detected. Check `ceph health detail`.', + }, + }, + ], + }, + { + name: 'PrometheusServer', + rules: [ + { + alert: 'PrometheusJobMissing', + 'for': '30s', + expr: 'absent(up{job="ceph"})', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.12.1' }, + annotations: { + summary: 'The scrape job for Ceph is missing from Prometheus%(cluster)s' % $.MultiClusterSummary(), + description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.", + }, + }, + ], + }, + { + name: 'rados', + rules: [ + { + alert: 'CephObjectMissing', + 'for': '30s', + expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.1' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound', + summary: 'Object(s) marked UNFOUND%(cluster)s' % $.MultiClusterSummary(), + description: 'The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.', + }, + }, + ], + }, + { + name: 'generic', + rules: [ + { + alert: 'CephDaemonCrash', + 'for': '1m', + expr: 'ceph_health_detail{name="RECENT_CRASH"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.1.2' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash', + summary: 'One or more Ceph daemons have crashed, and are pending acknowledgement%(cluster)s' % $.MultiClusterSummary(), + description: "One or more daemons have crashed recently, and need to be acknowledged. 
This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command.", + }, + }, + ], + }, + { + name: 'rbdmirror', + rules: [ + { + alert: 'CephRBDMirrorImagesPerDaemonHigh', + 'for': '1m', + expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.2' }, + annotations: { + summary: 'Number of image replications are now above %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + description: 'Number of image replications per daemon is not suppossed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + }, + }, + { + alert: 'CephRBDMirrorImagesNotInSync', + 'for': '1m', + expr: 'sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.3' }, + annotations: { + summary: 'Some of the RBD mirror images are not in sync with the remote counter parts.', + description: 'Both local and remote RBD mirror images should be in sync.', + }, + }, + { + alert: 'CephRBDMirrorImagesNotInSyncVeryHigh', + 'for': '1m', + expr: 'count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.4' }, + annotations: { + summary: 'Number of unsynchronized images are very high.', + description: 'More than 10% of the images have synchronization problems', + }, + }, + { + alert: 'CephRBDMirrorImageTransferBandwidthHigh', + 'for': '1m', + expr: 'rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > %.2f' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold], + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.5' }, + annotations: { + summary: 'The replication network usage has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], + description: 'Detected a heavy increase in bandwidth for rbd replications (over %d%s) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], + }, + }, + ], + }, + { + name: 'nvmeof', + rules: [ + { + alert: 'NVMeoFSubsystemNamespaceLimit', + 'for': '1m', + expr: '(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces %(cluster)s' % $.MultiClusterSummary(), + description: 'Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}', + }, + }, + { + alert: 'NVMeoFTooManyGateways', + 'for': '1m', + expr: 'count(ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Max supported gateways exceeded %(cluster)s' % $.MultiClusterSummary(), + description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config, + }, + }, + { + alert: 'NVMeoFMaxGatewayGroupSize', + 'for': '1m', + expr: 'count by(group) (ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded %(cluster)s' % $.MultiClusterSummary(), + description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config, + }, + }, + { + alert: 'NVMeoFSingleGatewayGroup', + 'for': '5m', + expr: 'count by(group) (ceph_nvmeof_gateway_info) == 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible %(cluster)s' % $.MultiClusterSummary(), + description: 'Although a single member gateway group is valid, it should only be used for test purposes', + }, + }, + { + alert: 'NVMeoFHighGatewayCPU', + 'for': '10m', + expr: 'label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high %(cluster)s' % $.MultiClusterSummary(), + description: 'Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores', + }, + }, + { + alert: 'NVMeoFGatewayOpenSecurity', + 'for': '5m', + expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security %(cluster)s' % $.MultiClusterSummary(), + description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss', + }, + }, + { + alert: 'NVMeoFTooManySubsystems', + 'for': '1m', + expr: 'count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The number of subsystems defined to the gateway exceeds supported values %(cluster)s' % $.MultiClusterSummary(), + description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported', + }, + }, + { + alert: 'NVMeoFVersionMismatch', + 'for': '1h', + expr: 'count(count by(version) (ceph_nvmeof_gateway_info)) > 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The cluster has different NVMe-oF gateway releases active %(cluster)s' % $.MultiClusterSummary(), + description: 'This may indicate an issue with deployment. 
Check cephadm logs', + }, + }, + { + alert: 'NVMeoFHighClientCount', + 'for': '1m', + expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The number of clients connected to {{ $labels.nqn }} is too high %(cluster)s' % $.MultiClusterSummary(), + description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config, + }, + }, + { + alert: 'NVMeoFHighHostCPU', + 'for': '10m', + expr: '100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }}) %(cluster)s' % $.MultiClusterSummary(), + description: 'High CPU on a gateway host can lead to CPU contention and performance degradation', + }, + }, + { + alert: 'NVMeoFInterfaceDown', + 'for': '30s', + expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' }, + annotations: { + summary: 'Network interface {{ $labels.device }} is down %(cluster)s' % $.MultiClusterSummary(), + description: 'A NIC used by one or more subsystems is in a down state', + }, + }, + { + alert: 'NVMeoFInterfaceDuplex', + 'for': '30s', + expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Network interface {{ $labels.device }} is not running in full duplex mode %(cluster)s' % $.MultiClusterSummary(), + description: 'Until this is resolved, performance from the gateway will be degraded', + }, + }, + { + alert: 'NVMeoFHighReadLatency', + 'for': '5m', + expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientReadLatency / 1000], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The average read latency over the last 5 mins has reached %(NVMeoFHighClientReadLatency)d ms or more on {{ $labels.gateway }}' % $._config, + description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', + }, + }, + { + alert: 'NVMeoFHighWriteLatency', + 'for': '5m', + expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientWriteLatency / 1000], + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'The average write latency over the last 5 mins has reached %(NVMeoFHighClientWriteLatency)d ms or more on {{ $labels.gateway }}' % $._config, + description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. 
Please investigate', + }, + }, + ], + }, + ], +} diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.yml b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.yml new file mode 100644 index 000000000..84452e584 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/prometheus_alerts.yml @@ -0,0 +1,887 @@ +groups: + - name: "cluster health" + rules: + - alert: "CephHealthError" + annotations: + description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the ERROR state" + expr: "ceph_health_status == 2" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.2.1" + severity: "critical" + type: "ceph_default" + - alert: "CephHealthWarning" + annotations: + description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the WARNING state" + expr: "ceph_health_status == 1" + for: "15m" + labels: + severity: "warning" + type: "ceph_default" + - name: "mon" + rules: + - alert: "CephMonDownQuorumAtRisk" + annotations: + description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "Monitor quorum is at risk" + expr: | + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( + count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + ) + ) == 1 + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDown" + annotations: + description: | + {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "One or more monitors down" + expr: | + count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonDiskspaceCritical" + annotations: + description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. 
Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit" + summary: "Filesystem space on at least one monitor is critically low" + expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.2" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDiskspaceLow" + annotations: + description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low" + summary: "Drive space on at least one monitor is approaching full" + expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonClockSkew" + annotations: + description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew" + summary: "Clock skew detected among monitors" + expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "osd" + rules: + - alert: "CephOSDDownHigh" + annotations: + description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). 
The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" + summary: "More than 10% of OSDs are down" + expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.1" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDHostDown" + annotations: + description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}" + summary: "An OSD host is offline" + expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.8" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDDown" + annotations: + description: | + {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down" + summary: "An OSD has been marked down" + expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.2" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDNearFull" + annotations: + description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull" + summary: "OSD(s) running low on free space (NEARFULL)" + expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.3" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFull" + annotations: + description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full" + summary: "OSD full, writes blocked" + expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.6" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDBackfillFull" + annotations: + description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." 
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull" + summary: "OSD(s) too full for backfill operations" + expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTooManyRepairs" + annotations: + description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs" + summary: "OSD reports a high number of read errors" + expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsPublicNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." + summary: "Network issues delaying OSD heartbeats (public network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsClusterNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." + summary: "Network issues delaying OSD heartbeats (cluster network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDInternalDiskSizeMismatch" + annotations: + description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch" + summary: "OSD size inconsistency error" + expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredicted" + annotations: + description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info '. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2" + summary: "Device(s) predicted to fail soon" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredictionTooHigh" + annotations: + description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated." 
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany" + summary: "Too many devices are predicted to fail, unable to resolve" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.7" + severity: "critical" + type: "ceph_default" + - alert: "CephDeviceFailureRelocationIncomplete" + annotations: + description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use" + summary: "Device failure is predicted, but unable to relocate data" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFlapping" + annotations: + description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." + documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds" + summary: "Network issues are causing OSDs to flap (mark each other down)" + expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.4" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDReadErrors" + annotations: + description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors" + summary: "Device read errors detected" + expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGImbalance" + annotations: + description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count." + summary: "PGs are not balanced across OSDs" + expr: | + abs( + ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / + on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.5" + severity: "warning" + type: "ceph_default" + - name: "mds" + rules: + - alert: "CephFilesystemDamaged" + annotations: + description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem is damaged." + expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.1" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemOffline" + annotations: + description: "All MDS ranks are unavailable. 
The MDS daemons managing metadata are down, rendering the filesystem offline." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down" + summary: "CephFS filesystem is offline" + expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.3" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemDegraded" + annotations: + description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded" + summary: "CephFS filesystem is degraded" + expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.4" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemMDSRanksLow" + annotations: + description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max" + summary: "Ceph MDS daemon count is lower than configured" + expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemInsufficientStandby" + annotations: + description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby" + summary: "Ceph filesystem standby daemons too few" + expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemFailureNoStandby" + annotations: + description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds" + summary: "MDS daemon failed, no further standby available" + expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.5" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemReadOnly" + annotations: + description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem in read only mode due to write error(s)" + expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.2" + severity: "critical" + type: "ceph_default" + - name: "mgr" + rules: + - alert: "CephMgrModuleCrash" + annotations: + description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure." 
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash" + summary: "A manager module has recently crashed" + expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMgrPrometheusModuleInactive" + annotations: + description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'." + summary: "The mgr/prometheus module is not available" + expr: "up{job=\"ceph\"} == 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.2" + severity: "critical" + type: "ceph_default" + - name: "pgs" + rules: + - alert: "CephPGsInactive" + annotations: + description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests." + summary: "One or more placement groups are inactive" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPGsUnclean" + annotations: + description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure." + summary: "One or more placement groups are marked unclean" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0" + for: "15m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPGsDamaged" + annotations: + description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use the 'ceph pg repair ' command." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged" + summary: "Placement group damaged, manual intervention needed" + expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.4" + severity: "critical" + type: "ceph_default" + - alert: "CephPGRecoveryAtRisk" + annotations: + description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full" + summary: "OSDs are too full for recovery" + expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.5" + severity: "critical" + type: "ceph_default" + - alert: "CephPGUnavailableBlockingIO" + annotations: + description: "Data availability is reduced, impacting the cluster's ability to service I/O. 
One or more placement groups (PGs) are in a state that blocks I/O." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability" + summary: "PG is unavailable, blocking I/O" + expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.3" + severity: "critical" + type: "ceph_default" + - alert: "CephPGBackfillAtRisk" + annotations: + description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full" + summary: "Backfill operations are blocked due to lack of free space" + expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.6" + severity: "critical" + type: "ceph_default" + - alert: "CephPGNotScrubbed" + annotations: + description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub " + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed" + summary: "Placement group(s) have not been scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGsHighPerOSD" + annotations: + description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs" + summary: "Placement groups per OSD is too high" + expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGNotDeepScrubbed" + annotations: + description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed" + summary: "Placement group(s) have not been deep scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "nodes" + rules: + - alert: "CephNodeRootFilesystemFull" + annotations: + description: "Root volume is dangerously full: {{ $value | humanize }}% free." 
+ summary: "Root filesystem is dangerously full" + expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.1" + severity: "critical" + type: "ceph_default" + - alert: "CephNodeNetworkPacketDrops" + annotations: + description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}." + summary: "One or more NICs reports packet drops" + expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.2" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkPacketErrors" + annotations: + description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}." + summary: "One or more NICs reports packet errors" + expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.3" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkBondDegraded" + annotations: + description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}." + summary: "Degraded Bond on Node {{ $labels.instance }}" + expr: | + node_bonding_slaves - node_bonding_active != 0 + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephNodeDiskspaceWarning" + annotations: + description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate." + summary: "Host filesystem free space is getting low" + expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.4" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeInconsistentMTU" + annotations: + description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}." 
+ summary: "MTU settings across Ceph hosts are inconsistent" + expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + labels: + severity: "warning" + type: "ceph_default" + - name: "pools" + rules: + - alert: "CephPoolGrowthWarning" + annotations: + description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours." + summary: "Pool growth rate may soon exceed capacity" + expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPoolBackfillFull" + annotations: + description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity." + summary: "Free space in a pool is too low for recovery/backfill" + expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPoolFull" + annotations: + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes )" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full" + summary: "Pool is full - writes are blocked" + expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPoolNearFull" + annotations: + description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes ). Also ensure that the balancer is active." 
+ summary: "One or more Ceph pools are nearly full" + expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "healthchecks" + rules: + - alert: "CephSlowOps" + annotations: + description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + summary: "OSD operations are slow to complete" + expr: "ceph_healthcheck_slow_ops > 0" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDaemonSlowOps" + annotations: + description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + summary: "{{ $labels.ceph_daemon }} operations are slow to complete" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - name: "cephadm" + rules: + - alert: "CephadmUpgradeFailed" + annotations: + description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue" + summary: "Ceph version upgrade has failed" + expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.2" + severity: "critical" + type: "ceph_default" + - alert: "CephadmDaemonFailed" + annotations: + description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start '" + summary: "A ceph daemon managed by cephadm is down" + expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.1" + severity: "critical" + type: "ceph_default" + - alert: "CephadmPaused" + annotations: + description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" + documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused" + summary: "Orchestration tasks via cephadm are PAUSED" + expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "hardware" + rules: + - alert: "HardwareStorageError" + annotations: + description: "Some storage devices are in error. Check `ceph health detail`." + summary: "Storage devices error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.1" + severity: "critical" + type: "ceph_default" + - alert: "HardwareMemoryError" + annotations: + description: "DIMM error(s) detected. Check `ceph health detail`." + summary: "DIMM error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.2" + severity: "critical" + type: "ceph_default" + - alert: "HardwareProcessorError" + annotations: + description: "Processor error(s) detected. Check `ceph health detail`." 
+ summary: "Processor error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.3" + severity: "critical" + type: "ceph_default" + - alert: "HardwareNetworkError" + annotations: + description: "Network error(s) detected. Check `ceph health detail`." + summary: "Network error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.4" + severity: "critical" + type: "ceph_default" + - alert: "HardwarePowerError" + annotations: + description: "Power supply error(s) detected. Check `ceph health detail`." + summary: "Power supply error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.5" + severity: "critical" + type: "ceph_default" + - alert: "HardwareFanError" + annotations: + description: "Fan error(s) detected. Check `ceph health detail`." + summary: "Fan error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.6" + severity: "critical" + type: "ceph_default" + - name: "PrometheusServer" + rules: + - alert: "PrometheusJobMissing" + annotations: + description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance." + summary: "The scrape job for Ceph is missing from Prometheus" + expr: "absent(up{job=\"ceph\"})" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.12.1" + severity: "critical" + type: "ceph_default" + - name: "rados" + rules: + - alert: "CephObjectMissing" + annotations: + description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound" + summary: "Object(s) marked UNFOUND" + expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.1" + severity: "critical" + type: "ceph_default" + - name: "generic" + rules: + - alert: "CephDaemonCrash" + annotations: + description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command." 
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash" + summary: "One or more Ceph daemons have crashed, and are pending acknowledgement" + expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.1.2" + severity: "critical" + type: "ceph_default" + - name: "rbdmirror" + rules: + - alert: "CephRBDMirrorImagesPerDaemonHigh" + annotations: + description: "Number of image replications per daemon is not suppossed to go beyond threshold 100" + summary: "Number of image replications are now above 100" + expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.2" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImagesNotInSync" + annotations: + description: "Both local and remote RBD mirror images should be in sync." + summary: "Some of the RBD mirror images are not in sync with the remote counter parts." + expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.3" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImagesNotInSyncVeryHigh" + annotations: + description: "More than 10% of the images have synchronization problems" + summary: "Number of unsynchronized images are very high." + expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.4" + severity: "critical" + type: "ceph_default" + - alert: "CephRBDMirrorImageTransferBandwidthHigh" + annotations: + description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" + summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" + expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.5" + severity: "warning" + type: "ceph_default" + - name: "nvmeof" + rules: + - alert: "NVMeoFSubsystemNamespaceLimit" + annotations: + description: "Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}" + summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces " + expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFTooManyGateways" + annotations: + description: "You may create many gateways, but 4 is the tested limit" + summary: "Max supported gateways exceeded " + expr: "count(ceph_nvmeof_gateway_info) > 4.00" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFMaxGatewayGroupSize" + annotations: + description: "You may create many gateways in a gateway group, but 2 is the tested limit" + summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded " + expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFSingleGatewayGroup" + annotations: + description: "Although a single member gateway group is valid, it should only be used for test purposes" + summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible " + expr: "count by(group) (ceph_nvmeof_gateway_info) == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFHighGatewayCPU" + annotations: + description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores" + summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high " + expr: "label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00" + for: "10m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFGatewayOpenSecurity" + annotations: + description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" + summary: "Subsystem {{ $labels.nqn }} has been defined without host level security " + expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFTooManySubsystems" + annotations: + description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported" + summary: "The number of subsystems defined to the gateway exceeds supported values " + expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFVersionMismatch" + annotations: + description: "This may indicate an issue with deployment. 
Check cephadm logs" + summary: "The cluster has different NVMe-oF gateway releases active " + expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1" + for: "1h" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFHighClientCount" + annotations: + description: "The supported limit for clients connecting to a subsystem is 32" + summary: "The number of clients connected to {{ $labels.nqn }} is too high " + expr: "ceph_nvmeof_subsystem_host_count > 32.00" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFHighHostCPU" + annotations: + description: "High CPU on a gateway host can lead to CPU contention and performance degradation" + summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) " + expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00" + for: "10m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFInterfaceDown" + annotations: + description: "A NIC used by one or more subsystems is in a down state" + summary: "Network interface {{ $labels.device }} is down " + expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.14.1" + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFInterfaceDuplex" + annotations: + description: "Until this is resolved, performance from the gateway will be degraded" + summary: "Network interface {{ $labels.device }} is not running in full duplex mode " + expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFHighReadLatency" + annotations: + description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" + summary: "The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}" + expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFHighWriteLatency" + annotations: + description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. 
Please investigate" + summary: "The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}" + expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-alerts.txt b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-alerts.txt new file mode 100644 index 000000000..3ba54d314 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-alerts.txt @@ -0,0 +1,2 @@ +pyyaml==6.0.1 +bs4 diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-grafonnet.txt b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-grafonnet.txt new file mode 100644 index 000000000..9891d5590 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-grafonnet.txt @@ -0,0 +1 @@ +jsondiff diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-lint.txt b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-lint.txt new file mode 100644 index 000000000..8c7219897 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/requirements-lint.txt @@ -0,0 +1,18 @@ +attrs==21.2.0 +behave==1.2.6 +py==1.10.0 +pyparsing==2.4.7 +PyYAML==6.0.1 +types-PyYAML==6.0.0 +typing-extensions==3.10.0.2 +termcolor==1.1.0 +types-termcolor==1.1.2 +dataclasses==0.6 +types-dataclasses==0.6.1 +six==1.16.0 +toml==0.10.2 +pylint==2.6.0 +isort==5.10.0 +mypy==0.910 +mypy-extensions==0.4.3 +prettytable==2.4.0 diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/test-jsonnet.sh b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/test-jsonnet.sh new file mode 100755 index 000000000..87c533892 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/test-jsonnet.sh @@ -0,0 +1,35 @@ +#!/bin/sh -e + +TEMPDIR=$(mktemp -d) +BASEDIR=$(dirname "$0") + +jsonnet -J vendor -m ${TEMPDIR} $BASEDIR/dashboards.jsonnet + +truncate -s 0 ${TEMPDIR}/json_difference.log +for file in ${BASEDIR}/dashboards_out/*.json +do + file_name="$(basename $file)" + for generated_file in ${TEMPDIR}/*.json + do + generated_file_name="$(basename $generated_file)" + if [ "$file_name" == "$generated_file_name" ]; then + jsondiff --indent 2 "${generated_file}" "${file}" \ + | tee -a ${TEMPDIR}/json_difference.log + fi + done +done + +jsonnet -J vendor -S alerts.jsonnet -o ${TEMPDIR}/prometheus_alerts.yml +jsondiff --indent 2 "prometheus_alerts.yml" 
"${TEMPDIR}/prometheus_alerts.yml" \ + | tee -a ${TEMPDIR}/json_difference.log + +err=0 +if [ $(wc -l < ${TEMPDIR}/json_difference.log) -eq 0 ] +then + rm -rf ${TEMPDIR} + echo "Congratulations! Grafonnet Check Passed" +else + rm -rf ${TEMPDIR} + echo "Grafonnet Check Failed, failed comparing generated file with existing" + exit 1 +fi diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/README.md b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/README.md new file mode 100644 index 000000000..cf95fa636 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/README.md @@ -0,0 +1,92 @@ + +## Alert Rule Standards + +The alert rules should adhere to the following principles +- each alert must have a unique name +- each alert should define a common structure + - labels : must contain severity and type + - annotations : must provide description + - expr : must define the promql expression + - alert : defines the alert name +- alerts that have a corresponding section within docs.ceph.com must include a + documentation field in the annotations section +- critical alerts should declare an oid in the labels section +- critical alerts should have a corresponding entry in the Ceph MIB + +  +## Testing Prometheus Rules +Once you have updated the `ceph_default_alerts.yml` file, you should use the +`validate_rules.py` script directly, or via `tox` to ensure the format of any update +or change aligns to our rule structure guidelines. The validate_rules.py script will +process the rules and look for any configuration anomalies and output a report if +problems are detected. + +Here's an example run, to illustrate the format and the kinds of issues detected. + +``` +[paul@myhost tests]$ ./validate_rules.py + +Checking rule groups + cluster health : .. + mon : E.W.. + osd : E...W......W.E.. + mds : WW + mgr : WW + pgs : ..WWWW.. + nodes : .EEEE + pools : EEEW. + healthchecks : . + cephadm : WW. + prometheus : W + rados : W + +Summary + +Rule file : ../alerts/ceph_default_alerts.yml +Unit Test file : test_alerts.yml + +Rule groups processed : 12 +Rules processed : 51 +Rule errors : 10 +Rule warnings : 16 +Rule name duplicates : 0 +Unit tests missing : 4 + +Problem Report + + Group Severity Alert Name Problem Description + ----- -------- ---------- ------------------- + cephadm Warning Cluster upgrade has failed critical level alert is missing an SNMP oid entry + cephadm Warning A daemon managed by cephadm is down critical level alert is missing an SNMP oid entry + mds Warning Ceph Filesystem damage detected critical level alert is missing an SNMP oid entry + mds Warning Ceph Filesystem switched to READ ONLY critical level alert is missing an SNMP oid entry + mgr Warning mgr module failure critical level alert is missing an SNMP oid entry + mgr Warning mgr prometheus module is not active critical level alert is missing an SNMP oid entry + mon Error Monitor down, quorum is at risk documentation link error: #mon-downwah not found on the page + mon Warning Ceph mon disk space critically low critical level alert is missing an SNMP oid entry + nodes Error network packets dropped invalid alert structure. Missing field: for + nodes Error network packet errors invalid alert structure. 
Missing field: for + nodes Error storage filling up invalid alert structure. Missing field: for + nodes Error MTU Mismatch invalid alert structure. Missing field: for + osd Error 10% OSDs down invalid alert structure. Missing field: for + osd Error Flapping OSD invalid alert structure. Missing field: for + osd Warning OSD Full critical level alert is missing an SNMP oid entry + osd Warning Too many devices predicted to fail critical level alert is missing an SNMP oid entry + pgs Warning Placement Group (PG) damaged critical level alert is missing an SNMP oid entry + pgs Warning Recovery at risk, cluster too full critical level alert is missing an SNMP oid entry + pgs Warning I/O blocked to some data critical level alert is missing an SNMP oid entry + pgs Warning Cluster too full, automatic data recovery impaired critical level alert is missing an SNMP oid entry + pools Error pool full invalid alert structure. Missing field: for + pools Error pool filling up (growth forecast) invalid alert structure. Missing field: for + pools Error Ceph pool is too full for recovery/rebalance invalid alert structure. Missing field: for + pools Warning Ceph pool is full - writes blocked critical level alert is missing an SNMP oid entry + prometheus Warning Scrape job is missing critical level alert is missing an SNMP oid entry + rados Warning Data not found/missing critical level alert is missing an SNMP oid entry + +Unit tests are incomplete. Tests missing for the following alerts; + - Placement Group (PG) damaged + - OSD Full + - storage filling up + - pool filling up (growth forecast) + +``` diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/__init__.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/settings.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/settings.py new file mode 100644 index 000000000..d99dfdca6 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/settings.py @@ -0,0 +1,11 @@ +import os + +ALERTS_FILE = '../prometheus_alerts.yml' +UNIT_TESTS_FILE = 'test_alerts.yml' +MIB_FILE = '../../snmp/CEPH-MIB.txt' + +current_dir = os.path.dirname(os.path.abspath(__file__)) + +ALERTS_FILE = os.path.join(current_dir, ALERTS_FILE) +UNIT_TESTS_FILE = os.path.join(current_dir, UNIT_TESTS_FILE) +MIB_FILE = os.path.join(current_dir, MIB_FILE) diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml new file mode 100644 index 000000000..40d6f4d09 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -0,0 +1,2644 @@ +rule_files: + - ../prometheus_alerts.yml +evaluation_interval: 5m +tests: + # health error + - interval: 5m + input_series: + - 
series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + values: '2 2 2 2 2 2 2' + promql_expr_test: + - expr: ceph_health_status == 2 + eval_time: 5m + exp_samples: + - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + value: 2 + alert_rule_test: + - eval_time: 1m + alertname: CephHealthError + - eval_time: 6m + alertname: CephHealthError + exp_alerts: + - exp_labels: + instance: ceph:9283 + job: ceph + oid: 1.3.6.1.4.1.50495.1.2.1.2.1 + type: ceph_default + severity: critical + exp_annotations: + summary: Ceph is in the ERROR state + description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information. + + # health warning + - interval: 5m + input_series: + - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + values: '1 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_status == 1 + eval_time: 15m + exp_samples: + - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + value: 1 + alert_rule_test: + - eval_time: 10m + alertname: CephHealthWarning + - eval_time: 20m + alertname: CephHealthWarning + exp_alerts: + - exp_labels: + instance: ceph:9283 + job: ceph + type: ceph_default + severity: warning + exp_annotations: + summary: Ceph is in the WARNING state + description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information. + + # 10% OSDs down + - interval: 1m + input_series: + - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' + values: '1 1 1 1 1' + - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' + values: '0 0 0 0 0' + - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' + values: '1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1' + promql_expr_test: + - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 + eval_time: 1m + exp_samples: + - labels: '{}' + value: 3.333333333333333E+01 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDDownHigh + exp_alerts: + - exp_labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.1 + type: ceph_default + severity: critical + exp_annotations: + summary: More than 10% of OSDs are down + description: "33.33% or 1 of 3 OSDs are down (>= 10%). 
The following OSDs are down: - osd.1 on ceph" + + # flapping OSD + - interval: 1s + input_series: + - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' + values: '1+1x100' + - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' + values: '1+0x100' + - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' + values: '1+0x100' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + promql_expr_test: + - expr: | + ( + rate(ceph_osd_up[5m]) + * on(ceph_daemon) group_left(hostname) ceph_osd_metadata + ) * 60 > 1 + eval_time: 1m + exp_samples: + - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283", + job="ceph"}' + value: 1.2200000000000001E+01 + alert_rule_test: + - eval_time: 5m + alertname: CephOSDFlapping + exp_alerts: + - exp_labels: + ceph_daemon: osd.0 + hostname: ceph + instance: ceph:9283 + job: ceph + oid: 1.3.6.1.4.1.50495.1.2.1.4.4 + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds + summary: Network issues are causing OSDs to flap (mark each other down) + description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." 
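+  # Note on the 'values' notation used throughout these tests: promtool's
+  # expanding notation 'a+bxn' emits a starting sample of a followed by n
+  # further samples, each increased by b (so '1+1x100' expands to 1 2 3 ... 101),
+  # and 'a-bxn' decreases by b per step. One sample is generated per test
+  # case 'interval'.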
+ + # high pg count deviation + - interval: 1m + input_series: + - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283", + job="ceph"}' + values: '100 100 100 100 100 160' + - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283", + job="ceph"}' + values: '100 100 100 100 100 320' + - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283", + job="ceph"}' + values: '100 100 100 100 100 160' + - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283", + job="ceph"}' + values: '100 100 100 100 100 160' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3", + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2"}' + values: '1 1 1 1 1 1' + promql_expr_test: + - expr: | + abs( + ( + (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) + by (job) + ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + + eval_time: 5m + exp_samples: + - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283", + job="ceph"}' + value: 6E-01 + alert_rule_test: + - eval_time: 10m + alertname: CephPGImbalance + exp_alerts: + - exp_labels: + ceph_daemon: osd.1 + hostname: ceph + instance: ceph:9283 + job: ceph + oid: 1.3.6.1.4.1.50495.1.2.1.4.5 + severity: warning + type: ceph_default + exp_annotations: + summary: PGs are not balanced across OSDs + description: "OSD osd.1 on ceph deviates by more than 30% from average PG count." 
+ + # pgs inactive + - interval: 1m + input_series: + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="1"}' + values: '1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="2"}' + values: '1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="3"}' + values: '1 1 1 1 1 1 1 1' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' + values: '1 1 1 1 1 1 1 1' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' + values: '32 32 32 32 32 32 32 32' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' + values: '33 32 32 32 32 33 33 32' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}' + values: '32 32 32 32 32 32 32 32' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}' + values: '32 32 32 32 32 32 32 32' + promql_expr_test: + - expr: ceph_pool_metadata * on(pool_id,instance) group_left() + (ceph_pg_total - ceph_pg_active) > 0 + eval_time: 5m + exp_samples: + - labels: '{instance="ceph:9283", job="ceph", + name="device_health_metrics", + pool_id="3"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: CephPGsInactive + exp_alerts: + - exp_labels: + instance: ceph:9283 + job: ceph + name: device_health_metrics + oid: 1.3.6.1.4.1.50495.1.2.1.7.1 + pool_id: 3 + severity: critical + type: ceph_default + exp_annotations: + summary: One or more placement groups are inactive + description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests." 
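+  # In the alert_rule_test blocks, an entry without exp_alerts asserts that the
+  # alert is NOT firing at that eval_time (typically because the rule's 'for'
+  # duration has not yet elapsed); an entry with exp_alerts asserts that it is
+  # firing with exactly the listed labels and annotations.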
+ + #pgs unclean + - interval: 1m + input_series: + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="2"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="device_health_metrics",pool_id="3"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 + 32 32 32' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' + values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 + 33 33' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 + 32 32' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 + 32 32' + promql_expr_test: + - expr: ceph_pool_metadata * on(pool_id,instance) group_left() + (ceph_pg_total - ceph_pg_clean) > 0 + eval_time: 15m + exp_samples: + - labels: '{instance="ceph:9283", job="ceph", + name="device_health_metrics", pool_id="3"}' + value: 1 + alert_rule_test: + - eval_time: 16m + alertname: CephPGsUnclean + exp_alerts: + - exp_labels: + instance: ceph:9283 + job: ceph + name: device_health_metrics + oid: 1.3.6.1.4.1.50495.1.2.1.7.2 + pool_id: 3 + severity: warning + type: ceph_default + exp_annotations: + summary: One or more placement groups are marked unclean + description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure." 
+ + # root volume full + - interval: 1m + input_series: + - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost + --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", + mountpoint="/"}' + values: '35336400896 35336400896 35336400896 35336400896 35336400896 + 3525385519.104 3533640089' + - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost + --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", + mountpoint="/"}' + values: '73445531648 73445531648 73445531648 73445531648 73445531648 + 73445531648 73445531648' + promql_expr_test: + - expr: node_filesystem_avail_bytes{mountpoint="/"} / + node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 + eval_time: 5m + exp_samples: + - labels: '{device="/dev/mapper/fedora_localhost --live-home", + fstype="ext4", instance="node-exporter", job="node-exporter", + mountpoint="/"}' + value: 4.8E+00 + alert_rule_test: + - eval_time: 10m + alertname: CephNodeRootFilesystemFull + exp_alerts: + - exp_labels: + device: /dev/mapper/fedora_localhost --live-home + fstype: ext4 + instance: node-exporter + job: node-exporter + mountpoint: / + oid: 1.3.6.1.4.1.50495.1.2.1.8.1 + severity: critical + type: ceph_default + exp_annotations: + summary: Root filesystem is dangerously full + description: "Root volume is dangerously full: 4.811% free." + + # network packets dropped + - interval: 1m + input_series: + - series: 'node_network_receive_drop_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+600x10' + - series: 'node_network_transmit_drop_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+600x10' + - series: 'node_network_receive_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + - series: 'node_network_transmit_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + promql_expr_test: + - expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 + + eval_time: 5m + exp_samples: + - labels: '{device="eth0", instance="node-exporter", + job="node-exporter"}' + value: 8E-1 + alert_rule_test: + - eval_time: 5m + alertname: CephNodeNetworkPacketDrops + exp_alerts: + - exp_labels: + device: eth0 + instance: node-exporter + job: node-exporter + oid: 1.3.6.1.4.1.50495.1.2.1.8.2 + severity: warning + type: ceph_default + exp_annotations: + summary: One or more NICs reports packet drops + description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0." 
+ + # network packets errors + - interval: 1m + input_series: + - series: 'node_network_receive_errs_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+600x10' + - series: 'node_network_transmit_errs_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+600x10' + - series: 'node_network_transmit_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + - series: 'node_network_receive_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + promql_expr_test: + - expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + + eval_time: 5m + exp_samples: + - labels: '{device="eth0", instance="node-exporter", + job="node-exporter"}' + value: 8E-01 + alert_rule_test: + - eval_time: 5m + alertname: CephNodeNetworkPacketErrors + exp_alerts: + - exp_labels: + device: eth0 + instance: node-exporter + job: node-exporter + oid: 1.3.6.1.4.1.50495.1.2.1.8.3 + severity: warning + type: ceph_default + exp_annotations: + summary: One or more NICs reports packet errors + description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0." + + # Bond is missing a peer + - interval: 1m + input_series: + - series: 'node_bonding_active{master="bond0", + instance="node-exporter",job="node-exporter"}' + values: '3' + - series: 'node_bonding_slaves{master="bond0", + instance="node-exporter",job="node-exporter"}' + values: '4' + promql_expr_test: + - expr: | + node_bonding_slaves - node_bonding_active != 0 + eval_time: 5m + exp_samples: + - labels: '{master="bond0", instance="node-exporter", + job="node-exporter"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: CephNodeNetworkBondDegraded + exp_alerts: + - exp_labels: + master: bond0 + instance: node-exporter + job: node-exporter + severity: warning + type: ceph_default + exp_annotations: + summary: Degraded Bond on Node node-exporter + description: "Bond bond0 is degraded on Node node-exporter." 
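+  # The disk space test below feeds a free-bytes series that shrinks by 256 MiB
+  # per minute ('21474836480-268435456x48', i.e. 20 GiB draining over 48 samples)
+  # so that predict_linear() over the 5-day horizon in the rule goes negative and
+  # CephNodeDiskspaceWarning fires.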
+ +# Node Storage disk space filling up + - interval: 1m + # 20GB = 21474836480, 256MB = 268435456 + input_series: + - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root", + fstype="xfs",instance="node-1",mountpoint="/rootfs"}' + values: '21474836480-268435456x48' + - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root", + fstype="xfs",instance="node-2",mountpoint="/rootfs"}' + values: '21474836480+0x48' + - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}' + values: 1+0x48 + - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}' + values: 1+0x48 + promql_expr_test: + - expr: | + predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) * + on(instance) group_left(nodename) node_uname_info < 0 + eval_time: 5m + exp_samples: + - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs", + mountpoint="/rootfs",nodename="node-1.unittests.com"}' + value: -1.912602624E+12 + alert_rule_test: + - eval_time: 5m + alertname: CephNodeDiskspaceWarning + exp_alerts: + - exp_labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.8.4 + device: /dev/mapper/vg-root + fstype: xfs + instance: node-1 + mountpoint: /rootfs + nodename: node-1.unittests.com + exp_annotations: + summary: Host filesystem free space is getting low + description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate." + # MTU Mismatch + - interval: 1m + input_series: + - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter", + job="node-exporter"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter", + job="node-exporter"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter", + job="node-exporter"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter", + job="node-exporter"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", + job="node-exporter"}' + values: '9000 9000 9000 9000 9000' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1", + job="node-exporter"}' + values: '2200 2200 2200 2200 2200' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2", + job="node-exporter"}' + values: '2400 2400 2400 2400 2400' + - series: 'node_network_up{device="eth0",instance="node-exporter", + job="node-exporter"}' + values: '0 0 0 0 0' + - series: 'node_network_up{device="eth1",instance="node-exporter", + job="node-exporter"}' + values: '0 0 0 0 0' + - series: 'node_network_up{device="eth2",instance="node-exporter", + job="node-exporter"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth3",instance="node-exporter", + job="node-exporter"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node-exporter", + job="node-exporter"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="hostname1", + job="node-exporter"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="hostname2", + job="node-exporter"}' + values: '0 0 0 0 0' + promql_expr_test: + - expr: | + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * 
(node_network_up{device!="lo"} > 0)) + ) + or + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + ) + eval_time: 1m + exp_samples: + - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}' + value: 9000 + - labels: '{device="eth4", instance="hostname1", job="node-exporter"}' + value: 2200 + alert_rule_test: + - eval_time: 1m + alertname: CephNodeInconsistentMTU + exp_alerts: + - exp_labels: + device: eth4 + instance: hostname1 + job: node-exporter + severity: warning + type: ceph_default + exp_annotations: + summary: MTU settings across Ceph hosts are inconsistent + description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4." + - exp_labels: + device: eth4 + instance: node-exporter + job: node-exporter + severity: warning + type: ceph_default + exp_annotations: + summary: MTU settings across Ceph hosts are inconsistent + description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4." + + # pool full, data series has 6 but using topk(5) so to ensure the + # results are working as expected + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="POOL_FULL"}' + values: '0 0 0 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_percent_used{pool_id="1"}' + values: '32+0x10' + - series: 'ceph_pool_percent_used{pool_id="2"}' + values: '96+0x10' + - series: 'ceph_pool_percent_used{pool_id="3"}' + values: '90+0x10' + - series: 'ceph_pool_percent_used{pool_id="4"}' + values: '72+0x10' + - series: 'ceph_pool_percent_used{pool_id="5"}' + values: '19+0x10' + - series: 'ceph_pool_percent_used{pool_id="6"}' + values: '10+0x10' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="cephfs_data",pool_id="1"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="rbd",pool_id="2"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="iscsi",pool_id="3"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="default.rgw.index",pool_id="4"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="default.rgw.log",pool_id="5"}' + values: '1 1 1 1 1 1 1 1 1' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", + name="dummy",pool_id="6"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="POOL_FULL"} > 0 + eval_time: 5m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPoolFull + - eval_time: 10m + alertname: CephPoolFull + exp_alerts: + - exp_labels: + name: POOL_FULL + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.9.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full + summary: Pool is full - writes are blocked + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. 
Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes )" + # slow OSD ops + - interval : 1m + input_series: + - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}' + values: '1+0x120' + promql_expr_test: + - expr: ceph_healthcheck_slow_ops > 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283", + job="ceph"}' + value: 1 + alert_rule_test: + - eval_time: 20m + alertname: CephSlowOps + exp_alerts: + - exp_labels: + instance: ceph:9283 + job: ceph + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: OSD operations are slow to complete + description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + + # slow daemon ops + - interval : 1m + input_series: + - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}' + values: '1+0x120' + promql_expr_test: + - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0' + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283", + job="ceph", type="SLOW_OPS"}' + value: 1 + alert_rule_test: + - eval_time: 20m + alertname: CephDaemonSlowOps + exp_alerts: + - exp_labels: + instance: ceph:9283 + ceph_daemon: "osd.1" + job: ceph + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: osd.1 operations are slow to complete + description: "osd.1 operations are taking too long to process (complaint time exceeded)" + +# CEPHADM orchestrator alert triggers + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephadmUpgradeFailed + - eval_time: 5m + alertname: CephadmUpgradeFailed + exp_alerts: + - exp_labels: + name: UPGRADE_EXCEPTION + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.11.2 + exp_annotations: + summary: Ceph version upgrade has failed + description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue" + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephadmDaemonFailed + - eval_time: 5m + alertname: CephadmDaemonFailed + exp_alerts: + - exp_labels: + name: CEPHADM_FAILED_DAEMON + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.11.1 + exp_annotations: + summary: A ceph daemon managed by cephadm is down + description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. 
you may start daemons with the 'ceph orch daemon start '" + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephadmPaused + - eval_time: 5m + alertname: CephadmPaused + exp_alerts: + - exp_labels: + name: CEPHADM_PAUSED + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused + summary: Orchestration tasks via cephadm are PAUSED + description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" +# MDS + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_DAMAGE"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemDamaged + - eval_time: 5m + alertname: CephFilesystemDamaged + exp_alerts: + - exp_labels: + name: MDS_DAMAGE + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem is damaged. + description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemReadOnly + - eval_time: 5m + alertname: CephFilesystemReadOnly + exp_alerts: + - exp_labels: + name: MDS_HEALTH_READ_ONLY + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.2 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem in read only mode due to write error(s) + description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}' + values: '0 0 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemOffline + - eval_time: 10m + alertname: CephFilesystemOffline + exp_alerts: + - exp_labels: + name: MDS_ALL_DOWN + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.3 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down + summary: CephFS filesystem is offline + description: "All MDS ranks are unavailable. 
The MDS daemons managing metadata are down, rendering the filesystem offline." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="FS_DEGRADED"}' + values: '0 0 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="FS_DEGRADED"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemDegraded + - eval_time: 10m + alertname: CephFilesystemDegraded + exp_alerts: + - exp_labels: + name: FS_DEGRADED + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.4 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded + summary: CephFS filesystem is degraded + description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}' + values: '0 0 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemInsufficientStandby + - eval_time: 10m + alertname: CephFilesystemInsufficientStandby + exp_alerts: + - exp_labels: + name: MDS_INSUFFICIENT_STANDBY + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby + summary: Ceph filesystem standby daemons too few + description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}' + values: '0 0 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemFailureNoStandby + - eval_time: 10m + alertname: CephFilesystemFailureNoStandby + exp_alerts: + - exp_labels: + name: FS_WITH_FAILED_MDS + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.5 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds + summary: MDS daemon failed, no further standby available + description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." 
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}' + values: '0 0 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephFilesystemMDSRanksLow + - eval_time: 10m + alertname: CephFilesystemMDSRanksLow + exp_alerts: + - exp_labels: + name: MDS_UP_LESS_THAN_MAX + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max + summary: Ceph MDS daemon count is lower than configured + description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." +# MGR + - interval: 1m + input_series: + - series: 'up{job="ceph", instance="ceph-mgr:9283"}' + values: '1+0x2 0+0x10' + promql_expr_test: + - expr: up{job="ceph"} == 0 + eval_time: 3m + exp_samples: + - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephMgrPrometheusModuleInactive + - eval_time: 10m + alertname: CephMgrPrometheusModuleInactive + exp_alerts: + - exp_labels: + instance: ceph-mgr:9283 + job: ceph + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.6.2 + exp_annotations: + summary: The mgr/prometheus module is not available + description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it; otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMgrModuleCrash + - eval_time: 15m + alertname: CephMgrModuleCrash + exp_alerts: + - exp_labels: + name: RECENT_MGR_MODULE_CRASH + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.6.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash + summary: A manager module has recently crashed + description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
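+  # The MON tests below use step series such as '0+0x2 1+0x10': the health flag
+  # stays at 0 for the first samples and then switches to 1, so the early
+  # alert_rule_test evaluation expects no alert while the later evaluation
+  # expects it once the rule's 'for' period has been satisfied.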
+# MON + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DISK_CRIT"}' + values: '0+0x2 1+0x10' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + values: '1+0x13' + promql_expr_test: + - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMonDiskspaceCritical + - eval_time: 10m + alertname: CephMonDiskspaceCritical + exp_alerts: + - exp_labels: + name: "MON_DISK_CRIT" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.3.2 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit + summary: Filesystem space on at least one monitor is critically low + description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a" + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DISK_LOW"}' + values: '0+0x2 1+0x10' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + values: '1+0x13' + promql_expr_test: + - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMonDiskspaceLow + - eval_time: 10m + alertname: CephMonDiskspaceLow + exp_alerts: + - exp_labels: + name: "MON_DISK_LOW" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low + summary: Drive space on at least one monitor is approaching full + description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. 
Your monitor hosts are; - ceph-mon-a" + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMonClockSkew + - eval_time: 10m + alertname: CephMonClockSkew + exp_alerts: + - exp_labels: + name: "MON_CLOCK_SKEW" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew + summary: Clock skew detected among monitors + description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." + +# Check 3 mons one down, quorum at risk + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DOWN"}' + values: '0+0x2 1+0x12' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + values: '1+0x2 0+0x12' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + values: '1+0x14' + promql_expr_test: + - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 + eval_time: 3m + exp_samples: + - labels: '{}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMonDownQuorumAtRisk + # shouldn't fire + - eval_time: 10m + alertname: CephMonDownQuorumAtRisk + exp_alerts: + - exp_labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.3.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: Monitor quorum is at risk + description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. 
The following monitors are down: - mon.c on ceph-mon-3" +# check 5 mons, 1 down - warning only + - interval: 1m + input_series: + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}' + values: '1+0x2 0+0x12' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}' + values: '1+0x14' + promql_expr_test: + - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) + eval_time: 3m + exp_samples: + - labels: '{}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephMonDown + - eval_time: 10m + alertname: CephMonDown + exp_alerts: + - exp_labels: + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: One or more monitors down + description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n" +# Device Health + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephDeviceFailurePredicted + - eval_time: 10m + alertname: CephDeviceFailurePredicted + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 + summary: Device(s) predicted to fail soon + description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info '. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." 
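A quick note on the two MON_DOWN scenarios above: both quorum expressions compare the monitors still in quorum against the bare majority of deployed monitors, floor(n/2) + 1. A minimal Python sketch of that arithmetic for the 3- and 5-monitor test cases (illustrative only, not part of the vendored rules or tests):

def bare_majority(total_mons: int) -> int:
    # Smallest number of monitors that still constitutes a quorum.
    return total_mons // 2 + 1

# 3-mon scenario: mon.c leaves quorum, 2 of 3 remain, which equals the bare majority of 2,
# so losing one more monitor would break quorum -> CephMonDownQuorumAtRisk (critical).
assert bare_majority(3) == 2

# 5-mon scenario: mon.e is down, 4 of 5 remain, comfortably above the majority of 3,
# so only the warning-level CephMonDown is expected to fire.
assert bare_majority(5) == 3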
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephDeviceFailurePredictionTooHigh + - eval_time: 10m + alertname: CephDeviceFailurePredictionTooHigh + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH_TOOMANY" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.7 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany + summary: Too many devices are predicted to fail, unable to resolve + description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephDeviceFailureRelocationIncomplete + - eval_time: 10m + alertname: CephDeviceFailureRelocationIncomplete + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH_IN_USE" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use + summary: Device failure is predicted, but unable to relocate data + description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." 
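The input_series values throughout these tests use promtool's expanding notation, where 'a+bxn' yields n+1 samples starting at a and stepping by b, one sample per interval. A simplified helper, covering only the forms that appear in this file, shows how a string such as '0+0x2 1+0x10' unrolls (a sketch, not part of the vendored test suite):

def expand(values: str) -> list[float]:
    """Expand promtool series shorthand like '0+0x2 1+0x10' into raw samples.

    Simplified: handles plain numbers, 'axn' and 'a+bxn' (the only forms used here).
    """
    samples: list[float] = []
    for token in values.split():
        base, _, count = token.partition('x')
        start, _, step = base.partition('+')
        if count:
            samples += [float(start) + float(step or 0) * i for i in range(int(count) + 1)]
        else:
            samples.append(float(start))
    return samples

# '0+0x2 1+0x10' -> three samples of 0 followed by eleven samples of 1, so with a 1m
# interval the health flag becomes 1 at t=3m, which is why the promql_expr_test entries
# above evaluate at eval_time: 3m.
print(expand('0+0x2 1+0x10'))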
+# OSD + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}' + values: '0+0x2 1+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + values: '1+0x2 0+0x10' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + values: '1+0x12' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDHostDown + - eval_time: 10m + alertname: CephOSDHostDown + exp_alerts: + - exp_labels: + name: "OSD_HOST_DOWN" + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.8 + exp_annotations: + summary: An OSD host is offline + description: "The following OSDs are down: - ceph-osd-1 : osd.0" + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDTimeoutsPublicNetwork + - eval_time: 10m + alertname: CephOSDTimeoutsPublicNetwork + exp_alerts: + - exp_labels: + name: "OSD_SLOW_PING_TIME_FRONT" + severity: warning + type: ceph_default + exp_annotations: + summary: Network issues delaying OSD heartbeats (public network) + description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDTimeoutsClusterNetwork + - eval_time: 10m + alertname: CephOSDTimeoutsClusterNetwork + exp_alerts: + - exp_labels: + name: "OSD_SLOW_PING_TIME_BACK" + severity: warning + type: ceph_default + exp_annotations: + summary: Network issues delaying OSD heartbeats (cluster network) + description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDInternalDiskSizeMismatch + - eval_time: 10m + alertname: CephOSDInternalDiskSizeMismatch + exp_alerts: + - exp_labels: + name: "BLUESTORE_DISK_SIZE_MISMATCH" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch + summary: OSD size inconsistency error + description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." 
+ - interval: 30s + input_series: + - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDReadErrors + - eval_time: 10m + alertname: CephOSDReadErrors + exp_alerts: + - exp_labels: + name: "BLUESTORE_SPURIOUS_READ_ERRORS" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors + summary: Device read errors detected + description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_DOWN"}' + values: '0+0x2 1+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + values: '1+0x12' + - series: 'ceph_osd_up{ceph_daemon="osd.1"}' + values: '1+0x2 0+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.2"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}' + values: '1+0x12' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_DOWN"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDDown + - eval_time: 10m + alertname: CephOSDDown + exp_alerts: + - exp_labels: + name: "OSD_DOWN" + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.2 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down + summary: An OSD has been marked down + description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n" + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_NEARFULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDNearFull + - eval_time: 10m + alertname: CephOSDNearFull + exp_alerts: + - exp_labels: + name: "OSD_NEARFULL" + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.3 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull + summary: OSD(s) running low on free space (NEARFULL) + description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. 
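As an aside on the CephOSDDown case above: the rendered description "The following OSD is down: - osd.1 on ceph-osd-2" comes from joining the down OSD with its host metadata on ceph_daemon. The hypothetical snippet below only mirrors that join against the test's input series; it is not the rule's actual annotation template:

# State of the test's input series at the 10m evaluation point.
osd_up = {"osd.0": 1, "osd.1": 0, "osd.2": 1}                                   # ceph_osd_up
osd_host = {"osd.0": "ceph-osd-1", "osd.1": "ceph-osd-2", "osd.2": "ceph-osd-3"}  # ceph_osd_metadata

down = [f"- {osd} on {osd_host[osd]}" for osd, up in sorted(osd_up.items()) if up == 0]
print("The following OSD is down: " + " ".join(down))   # -> "- osd.1 on ceph-osd-2"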
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_FULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_FULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDFull + - eval_time: 10m + alertname: CephOSDFull + exp_alerts: + - exp_labels: + name: "OSD_FULL" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.6 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full + summary: OSD full, writes blocked + description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDBackfillFull + - eval_time: 10m + alertname: CephOSDBackfillFull + exp_alerts: + - exp_labels: + name: "OSD_BACKFILLFULL" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull + summary: OSD(s) too full for backfill operations + description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephOSDTooManyRepairs + - eval_time: 10m + alertname: CephOSDTooManyRepairs + exp_alerts: + - exp_labels: + name: "OSD_TOO_MANY_REPAIRS" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs + summary: OSD reports a high number of read errors + description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive. 
+# Pools + # trigger percent full prediction on pools 1 and 2 only + - interval: 12h + input_series: + - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}' + values: '78 89 79 98 78' + - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}' + values: '22 22 23 23 24' + - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}' + values: '1 1 1 1 1' + promql_expr_test: + - expr: | + (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) + group_right() ceph_pool_metadata) >= 95 + eval_time: 36h + exp_samples: + - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}' + value: 1.435E+02 # 142% + alert_rule_test: + - eval_time: 48h + alertname: CephPoolGrowthWarning + exp_alerts: + - exp_labels: + instance: 8090 + name: default.rgw.index + pool_id: 1 + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.9.2 + exp_annotations: + summary: Pool growth rate may soon exceed capacity + description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPoolBackfillFull + - eval_time: 5m + alertname: CephPoolBackfillFull + exp_alerts: + - exp_labels: + name: "POOL_BACKFILLFULL" + severity: warning + type: ceph_default + exp_annotations: + summary: Free space in a pool is too low for recovery/backfill + description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity. + + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPoolNearFull + - eval_time: 10m + alertname: CephPoolNearFull + exp_alerts: + - exp_labels: + name: "POOL_NEAR_FULL" + severity: warning + type: ceph_default + exp_annotations: + summary: One or more Ceph pools are nearly full + description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes ). 
Also ensure that the balancer is active." + +# PGs + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPGNotScrubbed + - eval_time: 10m + alertname: CephPGNotScrubbed + exp_alerts: + - exp_labels: + name: "PG_NOT_SCRUBBED" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed + summary: Placement group(s) have not been scrubbed + description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub " + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_DAMAGED"}' + values: '0+0x4 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 + eval_time: 5m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPGsDamaged + - eval_time: 10m + alertname: CephPGsDamaged + exp_alerts: + - exp_labels: + name: "PG_DAMAGED" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.4 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged + summary: Placement group damaged, manual intervention needed + description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use the 'ceph pg repair ' command. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="TOO_MANY_PGS"}' + values: '0+0x4 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 + eval_time: 5m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPGsHighPerOSD + - eval_time: 10m + alertname: CephPGsHighPerOSD + exp_alerts: + - exp_labels: + name: "TOO_MANY_PGS" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs + summary: Placement groups per OSD is too high + description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." 
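Circling back to the CephPoolGrowthWarning sample above, the expected value 1.435E+02 can be reproduced by hand: predict_linear fits a least-squares line to the points inside the [2d] window and extrapolates 3600 * 24 * 5 seconds (five days) beyond the evaluation time. A back-of-the-envelope check, assuming a plain least-squares fit over the four samples visible at 36h (a sketch, not vendored code):

# ceph_pool_percent_used{pool_id="1", instance="8090"} samples that fall inside the
# [2d] window when evaluating at 36h, given the 12h interval used above.
times = [0.0, 43200.0, 86400.0, 129600.0]   # seconds: 0h, 12h, 24h, 36h
usage = [78.0, 89.0, 79.0, 98.0]            # percent used

mean_t = sum(times) / len(times)
mean_u = sum(usage) / len(usage)
num = sum((t - mean_t) * (u - mean_u) for t, u in zip(times, usage))
den = sum((t - mean_t) ** 2 for t in times)
slope = num / den                            # percent per second

horizon = 3600 * 24 * 5                      # 432000 s = 5 days
fitted_now = mean_u + slope * (times[-1] - mean_t)
print(round(fitted_now + slope * horizon, 1))  # 143.5 -> well past the >= 95 threshold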
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephPGRecoveryAtRisk + - eval_time: 10m + alertname: CephPGRecoveryAtRisk + exp_alerts: + - exp_labels: + name: "PG_RECOVERY_FULL" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.5 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full + summary: OSDs are too full for recovery + description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: CephPGBackfillAtRisk + - eval_time: 10m + alertname: CephPGBackfillAtRisk + exp_alerts: + - exp_labels: + name: "PG_BACKFILL_FULL" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.6 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full + summary: Backfill operations are blocked due to lack of free space + description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_AVAILABILITY"}' + values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_health_detail{name="OSD_DOWN"}' + values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0' + promql_expr_test: + - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) + eval_time: 1m + # empty set at 1m + exp_samples: + alert_rule_test: + # PG_AVAILABILITY and OSD_DOWN not firing .. no alert + - eval_time: 1m + alertname: CephPGUnavailableBlockingIO + exp_alerts: + # PG_AVAILABILITY firing, but osd_down is active .. no alert + - eval_time: 5m + alertname: CephPGUnavailableBlockingIO + exp_alerts: + # PG_AVAILABILITY firing, AND OSD_DOWN is not active...raise the alert + - eval_time: 15m + alertname: CephPGUnavailableBlockingIO + exp_alerts: + - exp_labels: + name: "PG_AVAILABILITY" + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.3 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability + summary: PG is unavailable, blocking I/O + description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O. 
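The CephPGUnavailableBlockingIO expression above deserves a second look: subtracting scalar(OSD_DOWN) suppresses the alert while the unavailability is already explained by a down OSD. A small sketch of the three evaluation points used in the test, assuming the rule fires when the difference is positive (in real PromQL the 1m case is an empty result because the == 1 filter matches nothing, which likewise means no alert):

def pg_unavailable_should_fire(pg_availability: int, osd_down: int) -> bool:
    # Mirrors ((PG_AVAILABILITY == 1) - scalar(OSD_DOWN)) from the expression above.
    return ((1 if pg_availability == 1 else 0) - osd_down) > 0

assert not pg_unavailable_should_fire(0, 0)   # 1m:  neither health flag raised
assert not pg_unavailable_should_fire(1, 1)   # 5m:  OSD_DOWN explains the unavailability
assert pg_unavailable_should_fire(1, 0)       # 15m: PGs unavailable with no OSD down -> alert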
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephPGNotDeepScrubbed + - eval_time: 10m + alertname: CephPGNotDeepScrubbed + exp_alerts: + - exp_labels: + name: "PG_NOT_DEEP_SCRUBBED" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed + summary: Placement group(s) have not been deep scrubbed + description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window. + +# Prometheus + - interval: 1m + input_series: + - series: 'up{job="myjob"}' + values: '1+0x10' + promql_expr_test: + - expr: absent(up{job="ceph"}) + eval_time: 1m + exp_samples: + - labels: '{job="ceph"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: PrometheusJobMissing + exp_alerts: + - exp_labels: + job: ceph + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.12.1 + exp_annotations: + summary: The scrape job for Ceph is missing from Prometheus + description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance. +# RADOS + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}' + values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_up{ceph_daemon="osd.1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_up{ceph_daemon="osd.2"}' + values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1 + eval_time: 1m + exp_samples: + alert_rule_test: + # OBJECT_UNFOUND but osd.2 is down, so don't fire + - eval_time: 5m + alertname: CephObjectMissing + exp_alerts: + # OBJECT_UNFOUND and all osd's are online, so fire + - eval_time: 15m + alertname: CephObjectMissing + exp_alerts: + - exp_labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.10.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound + summary: Object(s) marked UNFOUND + description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified. 
+# Generic Alerts + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="RECENT_CRASH"}' + values: '0 0 0 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="RECENT_CRASH"} == 1 + eval_time: 1m + exp_samples: + alert_rule_test: + # not firing + - eval_time: 1m + alertname: CephDaemonCrash + exp_alerts: + # firing + - eval_time: 10m + alertname: CephDaemonCrash + exp_alerts: + - exp_labels: + name: RECENT_CRASH + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.1.2 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash + summary: One or more Ceph daemons have crashed, and are pending acknowledgement + description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command. + + # new rbdmirror alerts tests + # RBD Mirror Alerts + # alert: CephRBDMirrorImagesPerDaemonHigh + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + values: '0+0x20 1+1x130' + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data"}' + values: '1+1x130 131+0x20' + # prometheus query test + promql_expr_test: + # negative test where there are no samples + - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + eval_time: 50m + exp_samples: + # second positive test + - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + eval_time: 70m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628", namespace="default"}' + value: 121 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 30m + alertname: CephRBDMirrorImagesPerDaemonHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 70m + alertname: CephRBDMirrorImagesPerDaemonHigh + exp_alerts: + - exp_labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.2" + severity: "critical" + type: "ceph_default" + ceph_daemon: "client.admin.40628" + namespace: "default" + exp_annotations: + description: "Number of image replications per daemon is not suppossed to go beyond threshold 100" + summary: "Number of image replications are now above 100" + + # alert: CephRBDMirrorImagesNotInSync + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 3.21+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 2.03+0x20' + # prometheus query test + promql_expr_test: + # negative test where there are no samples + - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + eval_time: 30m + exp_samples: + # second positive test + - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, 
ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + eval_time: 45m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + value: 1.1800000000000002 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 20m + alertname: CephRBDMirrorImagesNotInSync + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: CephRBDMirrorImagesNotInSync + exp_alerts: + - exp_labels: + image: "image1" + pool: "data" + oid: "1.3.6.1.4.1.50495.1.2.1.10.3" + severity: "critical" + type: "ceph_default" + ceph_daemon: "client.admin.40628" + namespace: "default" + exp_annotations: + description: "Both local and remote RBD mirror images should be in sync." + summary: "Some of the RBD mirror images are not in sync with the remote counter parts." + + # alert: CephRBDMirrorImagesNotInSyncVeryHigh + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 3.21+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + values: '1.678+0x20 2.03+0x20 2.03+0x20' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 3.301+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 7.13+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 3.301+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + values: '2.189+0x20 3.301+0x14 7.13+0x26' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + values: '2.189+0x65' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + values: '2.189+0x65' + - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628"}' + values: '1+0x20 2+0x45' + # prometheus query test + promql_expr_test: + # test each query individually + # query 1 + - expr: count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) + eval_time: 45m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 3 + # query 2 + - expr: sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1 + eval_time: 45m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.2 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 2m + alertname: CephRBDMirrorImagesNotInSyncVeryHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: CephRBDMirrorImagesNotInSyncVeryHigh + exp_alerts: + - exp_labels: + ceph_daemon: "client.admin.40628" + oid: "1.3.6.1.4.1.50495.1.2.1.10.4" + severity: "critical" + type: "ceph_default" + exp_annotations: + description: "More than 10% of the images 
have synchronization problems" + summary: "Number of unsynchronized images are very high." + + # alert: "CephRBDMirrorImageTransferBandwidthHigh" + - interval: 1m + input_series: + - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628"}' + values: '0+0x10 1+0x5 10+30x25 736+200x30' + # prometheus query test + promql_expr_test: + # test each couple of rates + # rate 1 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 5m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.0 + # rate 2 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 20m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.33 + # rate 3 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 40m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 0.5 + # rate 4 + - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) + eval_time: 50m + exp_samples: + - labels: '{ceph_daemon="client.admin.40628"}' + value: 3.3333333333333335 + # prometheus alert test + alert_rule_test: + # negative test + - eval_time: 2m + alertname: CephRBDMirrorImageTransferBandwidthHigh + exp_alerts: + # positive test where alert is fired + - eval_time: 50m + alertname: CephRBDMirrorImageTransferBandwidthHigh + exp_alerts: + - exp_labels: + ceph_daemon: "client.admin.40628" + oid: "1.3.6.1.4.1.50495.1.2.1.10.5" + severity: "warning" + type: "ceph_default" + exp_annotations: + description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" + summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" + + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareStorageError + - eval_time: 5m + alertname: HardwareStorageError + exp_alerts: + - exp_labels: + name: HARDWARE_STORAGE + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.1 + exp_annotations: + summary: Storage devices error(s) detected + description: "Some storage devices are in error. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareMemoryError + - eval_time: 5m + alertname: HardwareMemoryError + exp_alerts: + - exp_labels: + name: HARDWARE_MEMORY + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.2 + exp_annotations: + summary: DIMM error(s) detected + description: "DIMM error(s) detected. Check `ceph health detail`." 
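One more aside, on the CephRBDMirrorImagesNotInSyncVeryHigh queries shown earlier: the two sub-queries compare the count of out-of-sync images against 10% of the daemon's reported snapshot count. Plugging in the values the unit test expects (a sketch, not part of the vendored file):

# Values the unit test expects for client.admin.40628 around the 45-50m mark.
out_of_sync_images = 3                 # images whose local and remote snapshot timestamps differ
snapshots_reported = 2                 # ceph_rbd_mirror_snapshot_snapshots at that point
threshold = snapshots_reported * 0.1   # 0.2 -- the "10%" referenced in the annotation

assert out_of_sync_images > threshold  # hence the critical alert is expected at 50m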
+ - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareProcessorError + - eval_time: 5m + alertname: HardwareProcessorError + exp_alerts: + - exp_labels: + name: HARDWARE_PROCESSOR + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.3 + exp_annotations: + summary: Processor error(s) detected + description: "Processor error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareNetworkError + - eval_time: 5m + alertname: HardwareNetworkError + exp_alerts: + - exp_labels: + name: HARDWARE_NETWORK + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.4 + exp_annotations: + summary: Network error(s) detected + description: "Network error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_POWER"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwarePowerError + - eval_time: 5m + alertname: HardwarePowerError + exp_alerts: + - exp_labels: + name: HARDWARE_POWER + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.5 + exp_annotations: + summary: Power supply error(s) detected + description: "Power supply error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_FANS"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareFanError + - eval_time: 5m + alertname: HardwareFanError + exp_alerts: + - exp_labels: + name: HARDWARE_FANS + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.6 + exp_annotations: + summary: Fan error(s) detected + description: "Fan error(s) detected. Check `ceph health detail`." 
+ +# nvmeof Tests + # NVMeoFSubsystemNamespaceLimit + - interval: 1m + input_series: + - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah"}' + values: '5x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6"}' + values: '1x10' + promql_expr_test: + - expr: (count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit + eval_time: 1m + exp_samples: + - labels: '{nqn="wah"}' + value: 6 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFSubsystemNamespaceLimit + exp_alerts: + - exp_labels: + nqn: wah + severity: warning + type: ceph_default + exp_annotations: + summary: "wah subsystem has reached its maximum number of namespaces " + description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" + + # NVMeoFTooManyGateways + - interval: 1m + input_series: + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5"}' + values: '1+0x20' + promql_expr_test: + - expr: count(ceph_nvmeof_gateway_info) > 4.00 + eval_time: 1m + exp_samples: + - labels: '{}' + value: 5 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFTooManyGateways + exp_alerts: + - exp_labels: + severity: warning + type: ceph_default + exp_annotations: + summary: "Max supported gateways exceeded " + description: "You may create many gateways, but 4 is the tested limit" + + # NVMeoFMaxGatewayGroupSize + - interval: 1m + input_series: + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}' + values: '1+0x20' + promql_expr_test: + - expr: count by(group) (ceph_nvmeof_gateway_info) > 2.00 + eval_time: 1m + exp_samples: + - labels: '{group="group-1"}' + value: 3 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFMaxGatewayGroupSize + exp_alerts: + - exp_labels: + group: group-1 + severity: warning + type: ceph_default + exp_annotations: + summary: "Max gateways within a gateway group (group-1) exceeded " + description: "You may create many gateways in a gateway group, but 2 is the tested limit" + + # NVMeoFSingleGatewayGroup + - interval: 1m + input_series: + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}' + values: 
'1+0x20' + promql_expr_test: + - expr: count by(group) (ceph_nvmeof_gateway_info) == 1 + eval_time: 1m + exp_samples: + - labels: '{group="group-1"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFSingleGatewayGroup + exp_alerts: + - exp_labels: + group: group-1 + severity: warning + type: ceph_default + exp_annotations: + summary: "The gateway group group-1 consists of a single gateway - HA is not possible " + description: "Although a single member gateway group is valid, it should only be used for test purposes" + + # NVMeoFHighGatewayCPU + - interval: 1m + input_series: + - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008"}' + values: '880+5080x20' + promql_expr_test: + - expr: label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80 + eval_time: 5m + exp_samples: + - labels: '{instance="node-1"}' + value: 8.466666666666667E+01 + alert_rule_test: + - eval_time: 15m + alertname: NVMeoFHighGatewayCPU + exp_alerts: + - exp_labels: + instance: node-1 + severity: warning + type: ceph_default + exp_annotations: + summary: "CPU used by node-1 NVMe-oF Gateway is high " + description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores" + + # NVMeoFGatewayOpenSecurity + - interval: 1m + input_series: + - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes"}' + values: '1+0x10' + promql_expr_test: + - expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"} + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFGatewayOpenSecurity + exp_alerts: + - exp_labels: + allow_any_host: yes + nqn: nqn.bad + severity: warning + type: ceph_default + exp_annotations: + summary: "Subsystem nqn.bad has been defined without host level security " + description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" + + # NVMeoFTooManySubsystems + - interval: 1m + input_series: + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12"}' + values: '1+0x10' + - series: 
'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17"}' + values: '1+0x10' + promql_expr_test: + - expr: count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16 + eval_time: 1m + exp_samples: + - labels: '{gateway_host="node-1"}' + value: 17 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFTooManySubsystems + exp_alerts: + - exp_labels: + gateway_host: node-1 + severity: warning + type: ceph_default + exp_annotations: + summary: "The number of subsystems defined to the gateway exceeds supported values " + description: "Although you may continue to create subsystems in node-1, the configuration may not be supported" + + # NVMeoFVersionMismatch + - interval: 1m + input_series: + - series: 'ceph_nvmeof_gateway_info{version="0.0.7"}' + values: '1+0x80' + - series: 'ceph_nvmeof_gateway_info{version="1.0.0"}' + values: '1+0x80' + promql_expr_test: + - expr: count(count by(version) (ceph_nvmeof_gateway_info)) > 1 + eval_time: 1m + exp_samples: + - labels: '{}' + value: 2 + alert_rule_test: + - eval_time: 1h + alertname: NVMeoFVersionMismatch + exp_alerts: + - exp_labels: + severity: warning + type: ceph_default + exp_annotations: + summary: "The cluster has different NVMe-oF gateway releases active " + description: "This may indicate an issue with deployment. Check cephadm logs" + + # NVMeoFHighClientCount + - interval: 1m + input_series: + - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1"}' + values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44' + - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2"}' + values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16' + promql_expr_test: + - expr: ceph_nvmeof_subsystem_host_count > 32.00 + eval_time: 15m + exp_samples: + - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1"}' + value: 38 + alert_rule_test: + - eval_time: 20m + alertname: NVMeoFHighClientCount + exp_alerts: + - exp_labels: + nqn: nqn1 + severity: warning + type: ceph_default + exp_annotations: + summary: "The number of clients connected to nqn1 is too high " + description: "The supported limit for clients connecting to a subsystem is 32" + + # NVMeoFHighHostCPU + - interval: 1m + input_series: + - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0"}' + values: '0+18x10 180+9x20' + - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1"}' + values: '0+18x10 180+9x20' + - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008"}' + values: '1.00+0x20' + promql_expr_test: + - expr: 100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= 80 + eval_time: 16m + exp_samples: + - labels: '{host="node-1",instance="node-1:10008"}' + value: 85 + alert_rule_test: + # negative match at 15m + - eval_time: 15m + alertname: NVMeoFHighHostCPU + # positive match at 25m + - eval_time: 25m + alertname: NVMeoFHighHostCPU + exp_alerts: + - exp_labels: + instance: node-1:10008 + host: node-1 + 
severity: warning + type: ceph_default + exp_annotations: + summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) " + description: "High CPU on a gateway host can lead to CPU contention and performance degradation" + + # NVMeoFInterfaceDown - triggered on eth0 only + - interval: 30s + input_series: + - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0"}' + values: '1+0x30' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", device="eth1"}' + values: '1+0x30' + promql_expr_test: + - expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"} + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFInterfaceDown + exp_alerts: + - exp_labels: + oid: 1.3.6.1.4.1.50495.1.2.1.14.1 + operstate: down + device: eth0 + severity: warning + type: ceph_default + exp_annotations: + summary: "Network interface eth0 is down " + description: "A NIC used by one or more subsystems is in a down state" + + # NVMeoFInterfaceDuplex - triggered on eth1 only + - interval: 30s + input_series: + - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0"}' + values: '1+0x30' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1"}' + values: '1+0x30' + promql_expr_test: + - expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"} + eval_time: 30s + exp_samples: + - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFInterfaceDuplex + exp_alerts: + - exp_labels: + duplex: half + device: eth1 + severity: warning + type: ceph_default + exp_annotations: + summary: "Network interface eth1 is not running in full duplex mode " + description: "Until this is resolved, performance from the gateway will be degraded" + + # NVMeoFHighReadLatency + - interval: 30s + input_series: + - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1"}' + values: '0+1680x10 19800+3000x20' + - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1"}' + values: '0+286000x10 2980000+120000x20' + promql_expr_test: + - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02 + eval_time: 10m + exp_samples: + - labels: '{gateway="node-1",instance="node-1:10008"}' + value: 0.025 + alert_rule_test: + # negative test - latency is lower than 0.02s + - eval_time: 4m + alertname: NVMeoFHighReadLatency + # positive test - latency is higher than 0.02s + - eval_time: 15m + alertname: NVMeoFHighReadLatency + exp_alerts: + - exp_labels: + gateway: node-1 + instance: node-1:10008 + severity: warning + type: ceph_default + exp_annotations: + summary: "The average read latency over the last 5 mins has reached 10 ms or more on node-1" + description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. 
Please investigate" + + # NVMeoFHighWriteLatency + - interval: 30s + input_series: + - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1"}' + values: '0+1680x10 19800+3000x20' + - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1"}' + values: '0+286000x10 2980000+120000x20' + promql_expr_test: + - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[1m]) / rate(ceph_nvmeof_bdev_writes_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02 + eval_time: 10m + exp_samples: + - labels: '{gateway="node-1",instance="node-1:10008"}' + value: 0.025 + alert_rule_test: + # negative test - latency is lower than 0.02s + - eval_time: 4m + alertname: NVMeoFHighWriteLatency + # positive test - latency is higher than 0.02s + - eval_time: 15m + alertname: NVMeoFHighWriteLatency + exp_alerts: + - exp_labels: + gateway: node-1 + instance: node-1:10008 + severity: warning + type: ceph_default + exp_annotations: + summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1" + description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" + \ No newline at end of file diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_syntax.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_syntax.py new file mode 100755 index 000000000..966d768bd --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_syntax.py @@ -0,0 +1,42 @@ +import pytest +import os +import yaml +from .utils import promtool_available, call +from .settings import ALERTS_FILE, UNIT_TESTS_FILE + + +def load_yaml(file_name): + yaml_data = None + with open(file_name, 'r') as alert_file: + raw = alert_file.read() + try: + yaml_data = yaml.safe_load(raw) + except yaml.YAMLError as e: + pass + + return yaml_data + + +def test_alerts_present(): + assert os.path.exists(ALERTS_FILE), f"{ALERTS_FILE} not found" + + +def test_unittests_present(): + assert os.path.exists(UNIT_TESTS_FILE), f"{UNIT_TESTS_FILE} not found" + + +@pytest.mark.skipif(not os.path.exists(ALERTS_FILE), reason=f"{ALERTS_FILE} missing") +def test_rules_format(): + assert load_yaml(ALERTS_FILE) + + +@pytest.mark.skipif(not os.path.exists(UNIT_TESTS_FILE), reason=f"{UNIT_TESTS_FILE} missing") +def test_unittests_format(): + assert load_yaml(UNIT_TESTS_FILE) + + +@pytest.mark.skipif(not promtool_available(), reason="promtool is not installed. 
Unable to check syntax") +def test_rule_syntax(): + completion = call(f"promtool check rules {ALERTS_FILE}") + assert completion.returncode == 0 + assert b"SUCCESS" in completion.stdout diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_unittests.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_unittests.py new file mode 100644 index 000000000..4cfb2b600 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/test_unittests.py @@ -0,0 +1,19 @@ +import pytest +import os +from .utils import promtool_available, call +from .settings import ALERTS_FILE, UNIT_TESTS_FILE + + +def test_alerts_present(): + assert os.path.exists(ALERTS_FILE), f"{ALERTS_FILE} not found" + + +def test_unittests_present(): + assert os.path.exists(UNIT_TESTS_FILE), f"{UNIT_TESTS_FILE} not found" + + +@pytest.mark.skipif(not promtool_available(), reason="promtool is not installed. Unable to run unit tests") +def test_run_unittests(): + completion = call(f"promtool test rules {UNIT_TESTS_FILE}") + assert completion.returncode == 0 + assert b"SUCCESS" in completion.stdout diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/utils.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/utils.py new file mode 100644 index 000000000..842924447 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/utils.py @@ -0,0 +1,12 @@ +import pytest +import shutil +import subprocess + + +def promtool_available() -> bool: + return shutil.which('promtool') is not None + + +def call(cmd): + completion = subprocess.run(cmd.split(), stdout=subprocess.PIPE) + return completion diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py new file mode 100755 index 000000000..c24ce5c59 --- /dev/null +++ b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python3 +# +# Check the Prometheus rules for format, and integration +# with the unit tests. This script has the following exit +# codes: +# 0 .. Everything worked +# 4 .. rule problems or missing unit tests +# 8 .. Missing fields in YAML +# 12 .. Invalid YAML - unable to load +# 16 .. Missing input files +# +# Externals +# snmptranslate .. 
used to determine the oid's in the MIB to verify the rule -> MIB is correct +# + +import re +import os +import sys +import yaml +import shutil +import string +from bs4 import BeautifulSoup +from typing import List, Any, Dict, Set, Optional, Tuple +import subprocess + +import urllib.request +import urllib.error +from urllib.parse import urlparse + +from settings import ALERTS_FILE, MIB_FILE, UNIT_TESTS_FILE + +DOCLINK_NAME = 'documentation' + + +def isascii(s: str) -> bool: + try: + s.encode('ascii') + except UnicodeEncodeError: + return False + return True + + +def read_file(file_name: str) -> Tuple[str, str]: + try: + with open(file_name, 'r') as input_file: + raw_data = input_file.read() + except OSError: + return '', f"Unable to open {file_name}" + + return raw_data, '' + + +def load_yaml(file_name: str) -> Tuple[Dict[str, Any], str]: + data = {} + errs = '' + + raw_data, err = read_file(file_name) + if not err: + + try: + data = yaml.safe_load(raw_data) + except yaml.YAMLError as e: + errs = f"filename '{file_name} is not a valid YAML file" + + return data, errs + + +def run_command(command: str): + c = command.split() + completion = subprocess.run(c, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return (completion.returncode, + completion.stdout.decode('utf-8').split('\n'), + completion.stderr.decode('utf-8').split('\n')) + + +class HTMLCache: + def __init__(self) -> None: + self.cache: Dict[str, Tuple[int, str]] = {} + + def fetch(self, url_str: str) -> None: + parsed = urlparse(url_str) + url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + + if url in self.cache: + return self.cache[url] + + req = urllib.request.Request(url) + try: + r = urllib.request.urlopen(req) + except urllib.error.HTTPError as e: + self.cache[url] = e.code, e.reason + return self.cache[url] + except urllib.error.URLError as e: + self.cache[url] = 400, e.reason + return self.cache[url] + + if r.status == 200: + html = r.read().decode('utf-8') + self.cache[url] = 200, html + return self.cache[url] + + self.cache[url] = r.status, r.reason + return r.status, r.reason + + @property + def cached_pages(self) -> List[str]: + return self.cache.keys() + + @property + def cached_pages_total(self) -> int: + return len(self.cache.keys()) + +class PrometheusRule: + expected_attrs = [ + 'alert', + 'expr', + 'labels', + 'annotations' + ] + + def __init__(self, rule_group, rule_data: Dict[str, Any]): + + assert 'alert' in rule_data + self.group: RuleGroup = rule_group + self.name = rule_data.get('alert') + self.rule = rule_data + self.errors: List[str] = [] + self.warnings: List[str] = [] + self.validate() + + @property + def has_oid(self): + return True if self.rule.get('labels', {}).get('oid', '') else False + + @property + def labels(self) -> Dict[str, str]: + return self.rule.get('labels', {}) + + @property + def annotations(self) -> Dict[str, str]: + return self.rule.get('annotations', {}) + + def _check_alert_name(self): + # this is simplistic, but works in the context of the alert name + if self.name[0] in string.ascii_uppercase and \ + self.name != self.name.lower() and \ + self.name != self.name.upper() and \ + " " not in self.name and \ + "_" not in self.name: + return + + self.warnings.append("Alert name is not in CamelCase format") + + def _check_structure(self): + rule_attrs = self.rule.keys() + missing_attrs = [a for a in PrometheusRule.expected_attrs if a not in rule_attrs] + + if missing_attrs: + self.errors.append( + f"invalid alert structure. 
Missing field{'s' if len(missing_attrs) > 1 else ''}" + f": {','.join(missing_attrs)}") + + def _check_labels(self): + for rqd in ['severity', 'type']: + if rqd not in self.labels.keys(): + self.errors.append(f"rule is missing {rqd} label definition") + + def _check_annotations(self): + for rqd in ['summary', 'description']: + if rqd not in self.annotations: + self.errors.append(f"rule is missing {rqd} annotation definition") + + def _check_doclink(self): + doclink = self.annotations.get(DOCLINK_NAME, '') + + if doclink: + url = urlparse(doclink) + status, content = self.group.fetch_html_page(doclink) + if status == 200: + if url.fragment: + soup = BeautifulSoup(content, 'html.parser') + if not soup.find(id=url.fragment): + self.errors.append(f"documentation link error: {url.fragment} anchor not found on the page") + else: + # catch all + self.errors.append(f"documentation link error: {status} {content}") + + def _check_snmp(self): + oid = self.labels.get('oid', '') + + if self.labels.get('severity', '') == 'critical' and not oid: + self.warnings.append("critical level alert is missing an SNMP oid entry") + if oid and not re.search('^1.3.6.1.4.1.50495.1.2.\\d+.\\d+.\\d+$', oid): + self.errors.append("invalid OID format provided") + if self.group.get_oids(): + if oid and oid not in self.group.get_oids(): + self.errors.append(f"rule defines an OID {oid} that is missing from the MIB file({os.path.basename(MIB_FILE)})") + + def _check_ascii(self): + if 'oid' not in self.labels: + return + + desc = self.annotations.get('description', '') + summary = self.annotations.get('summary', '') + if not isascii(desc): + self.errors.append(f"non-ascii characters found in 'description' field will cause issues in associated snmp trap.") + if not isascii(summary): + self.errors.append(f"non-ascii characters found in 'summary' field will cause issues in associated snmp trap.") + + def validate(self): + + self._check_alert_name() + self._check_structure() + self._check_labels() + self._check_annotations() + self._check_doclink() + self._check_snmp() + self._check_ascii() + char = '.' 
+ + if self.errors: + char = 'E' + self.group.update('error', self.name) + elif self.warnings: + char = 'W' + self.group.update('warning', self.name) + + sys.stdout.write(char) + + +class RuleGroup: + + def __init__(self, rule_file, group_name: str, group_name_width: int): + self.rule_file: RuleFile = rule_file + self.group_name = group_name + self.rules: Dict[str, PrometheusRule] = {} + self.problems = { + "error": [], + "warning": [], + } + + sys.stdout.write(f"\n\t{group_name:<{group_name_width}} : ") + + def add_rule(self, rule_data:Dict[str, Any]): + alert_name = rule_data.get('alert') + self.rules[alert_name] = PrometheusRule(self, rule_data) + + def update(self, problem_type:str, alert_name:str): + assert problem_type in ['error', 'warning'] + + self.problems[problem_type].append(alert_name) + self.rule_file.update(self.group_name) + + def fetch_html_page(self, url): + return self.rule_file.fetch_html_page(url) + + def get_oids(self): + return self.rule_file.oid_list + + @property + def error_count(self): + return len(self.problems['error']) + + def warning_count(self): + return len(self.problems['warning']) + + @property + def count(self): + return len(self.rules) + + +class RuleFile: + + def __init__(self, parent, file_name, rules, oid_list): + self.parent = parent + self.file_name = file_name + self.rules: Dict[str, Any] = rules + self.oid_list = oid_list + self.problems: Set[str] = set() + self.group: Dict[str, RuleGroup] = {} + self.alert_names_seen: Set[str] = set() + self.duplicate_alert_names:List[str] = [] + self.html_cache = HTMLCache() + + assert 'groups' in self.rules + self.max_group_name_width = self.get_max_group_name() + self.load_groups() + + def update(self, group_name): + self.problems.add(group_name) + self.parent.mark_invalid() + + def fetch_html_page(self, url): + return self.html_cache.fetch(url) + + @property + def group_count(self): + return len(self.rules['groups']) + + @property + def rule_count(self): + rule_count = 0 + for _group_name, rule_group in self.group.items(): + rule_count += rule_group.count + return rule_count + + @property + def oid_count(self): + oid_count = 0 + for _group_name, rule_group in self.group.items(): + for _rule_name, rule in rule_group.rules.items(): + if rule.has_oid: + oid_count += 1 + return oid_count + + @property + def group_names(self): + return self.group.keys() + + @property + def problem_count(self): + return len(self.problems) + + def get_max_group_name(self): + group_name_list = [] + for group in self.rules.get('groups'): + group_name_list.append(group['name']) + return max([len(g) for g in group_name_list]) + + def load_groups(self): + sys.stdout.write("\nChecking rule groups") + for group in self.rules.get('groups'): + group_name = group['name'] + rules = group['rules'] + self.group[group_name] = RuleGroup(self, group_name, self.max_group_name_width) + for rule_data in rules: + if 'alert' in rule_data: + alert_name = rule_data.get('alert') + if alert_name in self.alert_names_seen: + self.duplicate_alert_names.append(alert_name) + else: + self.alert_names_seen.add(alert_name) + self.group[group_name].add_rule(rule_data) + else: + # skipped recording rule + pass + + def report(self): + def max_width(item_list: Set[str], min_width: int = 0) -> int: + return max([len(i) for i in item_list] + [min_width]) + + if not self.problems and not self.duplicate_alert_names: + print("\nNo problems detected in the rule file") + return + + print("\nProblem Report\n") + + group_width = max_width(self.problems, 5) + alert_names = 
set() + for g in self.problems: + group = self.group[g] + alert_names.update(group.problems.get('error', [])) + alert_names.update(group.problems.get('warning', [])) + alert_width = max_width(alert_names, 10) + + template = " {group:<{group_width}} {severity:<8} {alert_name:<{alert_width}} {description}" + + print(template.format( + group="Group", + group_width=group_width, + severity="Severity", + alert_name="Alert Name", + alert_width=alert_width, + description="Problem Description")) + + print(template.format( + group="-----", + group_width=group_width, + severity="--------", + alert_name="----------", + alert_width=alert_width, + description="-------------------")) + + for group_name in sorted(self.problems): + group = self.group[group_name] + rules = group.rules + for alert_name in group.problems.get('error', []): + for desc in rules[alert_name].errors: + print(template.format( + group=group_name, + group_width=group_width, + severity="Error", + alert_name=alert_name, + alert_width=alert_width, + description=desc)) + for alert_name in group.problems.get('warning', []): + for desc in rules[alert_name].warnings: + print(template.format( + group=group_name, + group_width=group_width, + severity="Warning", + alert_name=alert_name, + alert_width=alert_width, + description=desc)) + if self.duplicate_alert_names: + print("Duplicate alert names detected:") + for a in self.duplicate_alert_names: + print(f" - {a}") + + +class UnitTests: + expected_attrs = [ + 'rule_files', + 'tests', + 'evaluation_interval' + ] + def __init__(self, filename): + self.filename = filename + self.unit_test_data: Dict[str, Any] = {} + self.alert_names_seen: Set[str] = set() + self.problems: List[str] = [] + self.load() + + def load(self): + self.unit_test_data, errs = load_yaml(self.filename) + if errs: + print(f"\n\nError in unit tests file: {errs}") + sys.exit(12) + + missing_attr = [a for a in UnitTests.expected_attrs if a not in self.unit_test_data.keys()] + if missing_attr: + print(f"\nMissing attributes in unit tests: {','.join(missing_attr)}") + sys.exit(8) + + def _check_alert_names(self, alert_names: List[str]): + alerts_tested: Set[str] = set() + for t in self.unit_test_data.get('tests'): + test_cases = t.get('alert_rule_test', []) + if not test_cases: + continue + for case in test_cases: + alertname = case.get('alertname', '') + if alertname: + alerts_tested.add(alertname) + + alerts_defined = set(alert_names) + self.problems = list(alerts_defined.difference(alerts_tested)) + + def process(self, defined_alert_names: List[str]): + self._check_alert_names(defined_alert_names) + + def report(self) -> None: + + if not self.problems: + print("\nNo problems detected in unit tests file") + return + + print("\nUnit tests are incomplete. 
Tests missing for the following alerts;") + for p in self.problems: + print(f" - {p}") + +class RuleChecker: + + def __init__(self, rules_filename: str = None, test_filename: str = None): + self.rules_filename = rules_filename or ALERTS_FILE + self.test_filename = test_filename or UNIT_TESTS_FILE + self.rule_file: Optional[RuleFile] = None + self.unit_tests: Optional[UnitTests] = None + self.rule_file_problems: bool = False + self.errors = {} + self.warnings = {} + self.error_count = 0 + self.warning_count = 0 + self.oid_count = 0 + + self.oid_list = self.build_oid_list() + + def build_oid_list(self) -> List[str]: + + cmd = shutil.which('snmptranslate') + if not cmd: + return [] + + rc, stdout, stderr = run_command(f"{cmd} -Pu -Tz -M ../../snmp:/usr/share/snmp/mibs -m CEPH-MIB") + if rc != 0: + return [] + + oid_list: List[str] = [] + for line in stdout[:-1]: + _label, oid = line.replace('"', '').replace('\t', ' ').split() + oid_list.append(oid) + + return oid_list + + @property + def status(self): + if self.rule_file_problems or self.unit_tests.problems: + return 4 + + return 0 + + def mark_invalid(self): + self.rule_file_problems = True + + def summarise_rule_file(self): + for group_name in self.rule_file.problems: + group = self.rule_file.group[group_name] + self.error_count += len(group.problems['error']) + self.warning_count += len(group.problems['warning']) + + def ready(self): + errs: List[str] = [] + ready_state = True + if not os.path.exists(self.rules_filename): + errs.append(f"rule file '{self.rules_filename}' not found") + ready_state = False + + if not os.path.exists(self.test_filename): + errs.append(f"test file '{self.test_filename}' not found") + ready_state = False + + return ready_state, errs + + def run(self): + + ready, errs = self.ready() + if not ready: + print("Unable to start:") + for e in errs: + print(f"- {e}") + sys.exit(16) + + rules, errs = load_yaml(self.rules_filename) + if errs: + print(errs) + sys.exit(12) + + self.rule_file = RuleFile(self, self.rules_filename, rules, self.oid_list) + self.summarise_rule_file() + + self.unit_tests = UnitTests(self.test_filename) + self.unit_tests.process(self.rule_file.alert_names_seen) + + def report(self): + print("\n\nSummary\n") + print(f"Rule file : {self.rules_filename}") + print(f"Unit Test file : {self.test_filename}") + print(f"\nRule groups processed : {self.rule_file.group_count:>3}") + print(f"Rules processed : {self.rule_file.rule_count:>3}") + print(f"SNMP OIDs declared : {self.rule_file.oid_count:>3} {'(snmptranslate missing, unable to cross check)' if not self.oid_list else ''}") + print(f"Rule errors : {self.error_count:>3}") + print(f"Rule warnings : {self.warning_count:>3}") + print(f"Rule name duplicates : {len(self.rule_file.duplicate_alert_names):>3}") + print(f"Unit tests missing : {len(self.unit_tests.problems):>3}") + + self.rule_file.report() + self.unit_tests.report() + + +def main(): + checker = RuleChecker() + + checker.run() + checker.report() + print() + + sys.exit(checker.status) + + +if __name__ == '__main__': + main() diff --git a/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_dashboards/__init__.py b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_dashboards/__init__.py new file mode 100644 index 000000000..ea41d01be --- /dev/null +++ 
b/build/kube-prometheus/libraries/74e445ae4a2582f978bae2e0e9b63024d7f759d6/vendor/github.com/ceph/ceph/monitoring/ceph-mixin/tests_dashboards/__init__.py @@ -0,0 +1,189 @@ +import re +import subprocess +import sys +import tempfile +from dataclasses import asdict, dataclass, field +from typing import Any, List + +import yaml + +from .util import replace_grafana_expr_variables + + +@dataclass +class InputSeries: + series: str = '' + values: str = '' + +@dataclass +class ExprSample: + labels: str = '' + value: float = -1 + +@dataclass +class PromqlExprTest: + expr: str = '' + eval_time: str = '1m' + exp_samples: List[ExprSample] = field(default_factory=list) + +@dataclass +class Test: + interval: str = '1m' + input_series: List[InputSeries] = field(default_factory=list) + promql_expr_test: List[PromqlExprTest] = field(default_factory=list) + + +@dataclass +class TestFile: + evaluation_interval: str = '1m' + tests: List[Test] = field(default_factory=list) + + +class PromqlTest: + """ + Base class to provide prometheus query test capabilities. After setting up + the query test with its input and expected output it's expected to run promtool. + + https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-yml + + The workflow of testing would be something like: + + # add prometheus query to test + self.set_expression('node_bonding_slaves > 0') + + # add some prometheus input series + self.add_series('node_bonding_slaves{master="bond0"}', '2') + self.add_series('node_bonding_slaves{master="bond1"}', '3') + self.add_series('node_network_receive_bytes{instance="127.0.0.1", + device="eth1"}', "10 100 230 22") + + # expected output of the query + self.add_exp_samples('node_bonding_slaves{master="bond0"}', 2) + self.add_exp_samples('node_bonding_slaves{master="bond1"}', 3) + + # at last, always call promtool with: + self.assertTrue(self.run_promtool()) + # assertTrue means it expect promtool to succeed + """ + + def __init__(self): + self.test_output_file = tempfile.NamedTemporaryFile('w+') + + self.test_file = TestFile() + self.test = Test() + self.promql_expr_test = PromqlExprTest() + self.test.promql_expr_test.append(self.promql_expr_test) + self.test_file.tests.append(self.test) + + self.variables = {} + + def __del__(self): + self.test_output_file.close() + + + def set_evaluation_interval(self, interval: int, unit: str = 'm') -> None: + """ + Set the evaluation interval of the time series + + Args: + interval (int): number of units. + unit (str): unit type: 'ms', 's', 'm', etc... + """ + self.test_file.evaluation_interval = f'{interval}{unit}' + + def set_interval(self, interval: int, unit: str = 'm') -> None: + """ + Set the duration of the time series + + Args: + interval (int): number of units. + unit (str): unit type: 'ms', 's', 'm', etc... + """ + self.test.interval = f'{interval}{unit}' + + def set_expression(self, expr: str) -> None: + """ + Set the prometheus expression/query used to filter data. + + Args: + expr(str): expression/query. + """ + self.promql_expr_test.expr = expr + + def add_series(self, series: str, values: str) -> None: + """ + Add a series to the input. + + Args: + series(str): Prometheus series. + Notation: '{