From 81cd2c6a463f5fd80b5459e51a0f161e77cfa96c Mon Sep 17 00:00:00 2001 From: Rambabu Bolla Date: Wed, 16 Oct 2024 10:36:07 +0000 Subject: [PATCH] CASMMON-446 --- .spelling | 7 ++ operations/README.md | 1 - ...ewal_for_Kubernetes_and_Bare_Metal_EtcD.md | 15 +-- .../network/dns/PowerDNS_Configuration.md | 6 +- ...Failing_to_Discover_Services_Workaround.md | 62 +++++----- ..._Provisioned_with_External_IP_Addresses.md | 37 +++++- ...ccess_System_Management_Health_Services.md | 27 +++-- ...e_Prometheus_Alerta_Alert_Notifications.md | 108 ------------------ ...re_Prometheus_Email_Alert_Notifications.md | 6 +- .../system_management_health/Grafterm.md | 58 ---------- .../System_Management_Health.md | 2 +- ...tem_Management_Health_Checks_and_Alerts.md | 53 ++++----- 12 files changed, 124 insertions(+), 258 deletions(-) delete mode 100644 operations/system_management_health/Configure_Prometheus_Alerta_Alert_Notifications.md delete mode 100644 operations/system_management_health/Grafterm.md diff --git a/.spelling b/.spelling index 34076c548854..691149493f16 100644 --- a/.spelling +++ b/.spelling @@ -35,6 +35,13 @@ Vmalert Vminsert Vmselect Vmstorage +vmstorage +vmalert +vmselect +victoria +kubectl +kubelet + 1.0.x 1.2.x diff --git a/operations/README.md b/operations/README.md index ae2ce7a5b2e7..477c42a9b848 100644 --- a/operations/README.md +++ b/operations/README.md @@ -454,7 +454,6 @@ confident that a lack of issues indicates the system is operating normally. - [Configure Prometheus Email Alert Notifications](system_management_health/Configure_Prometheus_Email_Alert_Notifications.md) - [Grafana Dashboards by Component](system_management_health/Grafana_Dashboards_by_Component.md) - [Troubleshoot Grafana Dashboard](system_management_health/Troubleshoot_Grafana_Dashboard.md) -- [Grafterm](system_management_health/Grafterm.md) - [Remove Kiali](system_management_health/Remove_Kiali.md) - [`prometheus-kafka-adapter` errors during installation](system_management_health/Prometheus_Kafka_Error.md) - [`grok-exporter` errors during installation](system_management_health/Grok-Exporter_Error.md) diff --git a/operations/kubernetes/Cert_Renewal_for_Kubernetes_and_Bare_Metal_EtcD.md b/operations/kubernetes/Cert_Renewal_for_Kubernetes_and_Bare_Metal_EtcD.md index 584dda450055..777b91063de9 100644 --- a/operations/kubernetes/Cert_Renewal_for_Kubernetes_and_Bare_Metal_EtcD.md +++ b/operations/kubernetes/Cert_Renewal_for_Kubernetes_and_Bare_Metal_EtcD.md @@ -565,26 +565,27 @@ Run the following steps from a master node. 1. Restart Prometheus. ```bash - kubectl rollout restart -n sysmgmt-health statefulSet/prometheus-cray-sysmgmt-health-kube-p-prometheus - kubectl rollout status -n sysmgmt-health statefulSet/prometheus-cray-sysmgmt-health-kube-p-prometheus + kubectl rollout restart deployment -n sysmgmt-health vmagent-vms-0 + kubectl rollout status -n sysmgmt-health deployment.apps/vmagent-vms-0 + kubectl rollout restart deployment -n sysmgmt-health vmagent-vms-1 + kubectl rollout status -n sysmgmt-health deployment.apps/vmagent-vms-1 ``` Example output: ```text - Waiting for 1 pods to be ready... - statefulset rolling update complete ... + deployment "vmagent-vms-0" successfully rolled out ``` 1. Check for any `tls` errors from the active Prometheus targets. No errors are expected. 
```bash - PROM_IP=$(kubectl get services -n sysmgmt-health cray-sysmgmt-health-kube-p-prometheus -o json | jq -r '.spec.clusterIP') - curl -s http://${PROM_IP}:9090/api/v1/targets | jq -r '.data.activeTargets[] | select(."scrapePool" == "sysmgmt-health/cray-sysmgmt-health-kube-p-kube-etcd/0")' | grep lastError | sort -u + PROM_IP=$(kubectl get services -n sysmgmt-health vmagent-vms -o json | jq -r '.spec.clusterIP') + curl -s http://${PROM_IP}:8429/targets | grep kube-etcd | sort -u ``` Example output: ```text - "lastError": "", + state=up, endpoint=https://10.252.1.10:2379/metrics, labels={endpoint="http-metrics",instance="10.252.1.10:2379",job="kube-etcd",namespace="kube-system",service="vms-kube-etcd"}, scrapes_total=28114, scrapes_failed=0, last_scrape=14838ms ago, scrape_duration=14ms, samples_scraped=1487, error= ``` diff --git a/operations/network/dns/PowerDNS_Configuration.md b/operations/network/dns/PowerDNS_Configuration.md index 93db8250e1ba..aa4f1df8fd0b 100644 --- a/operations/network/dns/PowerDNS_Configuration.md +++ b/operations/network/dns/PowerDNS_Configuration.md @@ -4,7 +4,7 @@ PowerDNS replaces the CoreDNS server that earlier versions of CSM used to provide External DNS services. -The `cray-dns-powerdns-can-tcp` and `cray-dns-powerdns-can-udp` LoadBalancer resources are configured to service external DNS requests using the IP address specified by the CSI `--cmn-external-dns` command line argument. +The `cray-dns-powerdns-can-tcp` and `cray-dns-powerdns-can-udp` `LoadBalancer` resources are configured to service external DNS requests using the IP address specified by the CSI `--cmn-external-dns` command line argument. The CSI `--system-name` and `--site-domain` command line arguments are combined to form the subdomain used for External DNS. @@ -134,7 +134,7 @@ zone "8.101.10.in-addr.arpa" { The CSM implementation of PowerDNS supports the DNS Security Extensions (DNSSEC) and the signing of zones with a user-supplied zone signing key. -If DNSSEC is to be used for zone transfer then the `dnssec` SealedSecret in `customizations.yaml` should be updated to include a base64 encoded version of the private key portion of the desired zone signing key. +If DNSSEC is to be used for zone transfer then the `dnssec` SealedSecret in `customizations.yaml` should be updated to include a `base64` encoded version of the private key portion of the desired zone signing key. Here is an example of a zone signing key. @@ -221,7 +221,7 @@ spec: key: dnFC5euKixIKXAr6sZhI7kVQbQCXoDG5R5eHSYZiBxY= ``` -> **`IMPORTANT`** The key used for TSIG **must** have `.tsig` in the name and unlike the zone signing key it should not be base64 encoded. +> **`IMPORTANT`** The key used for TSIG **must** have `.tsig` in the name and unlike the zone signing key it should not be `base64` encoded. 
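
A key in this form can be generated with BIND's `tsig-keygen` utility. This is a minimal sketch; the key name `externaldns.tsig` is only an example, and any name containing `.tsig` works:

```bash
# Generate an hmac-sha256 TSIG key block suitable for named.conf.
# The key name carries the required ".tsig" suffix; the generated secret is
# used as-is (it is not base64 encoded a second time).
tsig-keygen -a hmac-sha256 externaldns.tsig
```

The `secret` value in the generated `key` block is what is placed in the `*.tsig` entry shown above.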
#### Example configuration for BIND
diff --git a/operations/network/external_dns/External_DNS_Failing_to_Discover_Services_Workaround.md b/operations/network/external_dns/External_DNS_Failing_to_Discover_Services_Workaround.md
index 05d64e2ea56c..b015d1dd5288 100644
--- a/operations/network/external_dns/External_DNS_Failing_to_Discover_Services_Workaround.md
+++ b/operations/network/external_dns/External_DNS_Failing_to_Discover_Services_Workaround.md
@@ -30,7 +30,7 @@ Use this procedure to resolve any external DNS routing issues with backend servi
     services        sma-kibana                         [services-gateway]           [sma-kibana.cmn.SYSTEM_DOMAIN_NAME]      2d16h
     sysmgmt-health  cray-sysmgmt-health-alertmanager   [services/services-gateway]  [alertmanager.cmn.SYSTEM_DOMAIN_NAME]    2d16h
     sysmgmt-health  cray-sysmgmt-health-grafana        [services/services-gateway]  [grafana.cmn.SYSTEM_DOMAIN_NAME]         2d16h
-    sysmgmt-health  cray-sysmgmt-health-prometheus     [services/services-gateway]  [prometheus.cmn.SYSTEM_DOMAIN_NAME]      2d16h
+    sysmgmt-health  cray-sysmgmt-health-vm-select      [services/services-gateway]  [vmselect.cmn.SYSTEM_DOMAIN_NAME]        2d16h
     ```

 1. (`ncn-mw#`) Inspect the `VirtualService` objects to learn the destination service and port.

    ```yaml
    apiVersion: networking.istio.io/v1beta1
    kind: VirtualService
    metadata:
-      creationTimestamp: "2020-07-09T17:49:07Z"
-      generation: 1
-      labels:
-        app: cray-sysmgmt-health-prometheus
-        app.kubernetes.io/instance: cray-sysmgmt-health
-        app.kubernetes.io/managed-by: Tiller
-        app.kubernetes.io/name: cray-sysmgmt-health
-        app.kubernetes.io/version: 8.15.4
-        helm.sh/chart: cray-sysmgmt-health-0.3.1
-      name: cray-sysmgmt-health-prometheus
-      namespace: sysmgmt-health
-      resourceVersion: "41620"
-      selfLink: /apis/networking.istio.io/v1beta1/namespaces/sysmgmt-health/virtualservices/cray-sysmgmt-health-prometheus
-      uid: d239dfcc-a827-4a51-9b73-6eccfb937088
-    spec:
-      gateways:
-      - services/services-gateway
-      hosts:
-      - prometheus.cmn.SYSTEM_DOMAIN_NAME
+      annotations:
+        meta.helm.sh/release-name: cray-sysmgmt-health
+        meta.helm.sh/release-namespace: sysmgmt-health
+      creationTimestamp: "2024-10-15T12:59:14Z"
+      generation: 1
+      labels:
+        app: cray-sysmgmt-health-vm-select
+        app.kubernetes.io/instance: cray-sysmgmt-health
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: cray-sysmgmt-health
+        app.kubernetes.io/version: 0.17.5
+        helm.sh/chart: cray-sysmgmt-health-1.0.17-20241016103148_b40f1aa
+      name: cray-sysmgmt-health-vm-select
+      namespace: sysmgmt-health
+      resourceVersion: "149049132"
+      uid: d166065d-1b3b-4434-b25b-e95cb8940b01
+    spec:
+      gateways:
+      - services/services-gateway
+      - services/customer-admin-gateway
+      hosts:
+      - vmselect.cmn.SYSTEM_DOMAIN_NAME
       http:
       - match:
-        - authority:
-            exact: prometheus.cmn.SYSTEM_DOMAIN_NAME
-          route:
-          - destination:
-              host: cray-sysmgmt-health-kube-p-prometheus
-              port:
-                number: 9090
+        - authority:
+            exact: vmselect.cmn.SYSTEM_DOMAIN_NAME
+          route:
+          - destination:
+              host: vmselect-vms
+              port:
+                number: 8481
    ```

-   From the `VirtualService data`, it is straightforward to see how traffic will be routed. In this example, connections to `prometheus.cmn.SYSTEM_DOMAIN_NAME` will be routed to the
-   `cray-sysmgmt-health-prometheus` service in the `sysmgmt-health` namespace on port 9090.
+   From the `VirtualService` data, it is straightforward to see how traffic will be routed.
+   In this example, connections to `vmselect.cmn.SYSTEM_DOMAIN_NAME` will be routed to the
+   `vmselect-vms` service in the `sysmgmt-health` namespace on port 8481.

 External DNS will now be connected to the backend service.

diff --git a/operations/network/external_dns/Troubleshoot_Systems_Not_Provisioned_with_External_IP_Addresses.md b/operations/network/external_dns/Troubleshoot_Systems_Not_Provisioned_with_External_IP_Addresses.md
index 130cac20aa57..811e2f8cd9b4 100644
--- a/operations/network/external_dns/Troubleshoot_Systems_Not_Provisioned_with_External_IP_Addresses.md
+++ b/operations/network/external_dns/Troubleshoot_Systems_Not_Provisioned_with_External_IP_Addresses.md
@@ -33,7 +33,7 @@ The Customer Management Network \(CMN\) is not supported on the system.
     services        sma-kibana                         [services-gateway]           [sma-kibana.cmn.SYSTEM_DOMAIN_NAME]      2d16h
     sysmgmt-health  cray-sysmgmt-health-alertmanager   [services/services-gateway]  [alertmanager.cmn.SYSTEM_DOMAIN_NAME]    2d16h
     sysmgmt-health  cray-sysmgmt-health-grafana        [services/services-gateway]  [grafana.cmn.SYSTEM_DOMAIN_NAME]         2d16h
-    sysmgmt-health  cray-sysmgmt-health-prometheus     [services/services-gateway]  [prometheus.cmn.SYSTEM_DOMAIN_NAME]      2d16h
+    sysmgmt-health  cray-sysmgmt-health-vm-select      [services/services-gateway]  [vmselect.cmn.SYSTEM_DOMAIN_NAME]        2d16h
     ```

 2. Look up the cluster IP and port for the service.

    ```console
    NAME                                    TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)    AGE
-   cray-sysmgmt-health-kube-p-prometheus   ClusterIP   10.25.124.159   <none>        9090/TCP   23h
+   cray-sysmgmt-health-grafana             ClusterIP   10.25.124.159   <none>        9090/TCP   23h
    ```

 3. Set up port forwarding from a laptop or workstation to access the service.

    ```

 4. Visit `http://localhost:9090/` in a laptop or workstation browser.

 5. Access the `vmselect` service, which is headless and therefore has no cluster IP.

    The example below is for the `vmselect-vms` service.

    1. Look up the service and port for the `vmselect-vms` service.

       ```bash
       kubectl -n sysmgmt-health get service vmselect-vms
       ```

       Example output:

       ```console
       NAME           TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)    AGE
       vmselect-vms   ClusterIP   None         <none>        8481/TCP   14d
       ```

    1. On the NCN, use `kubectl port-forward` to connect to a `vmselect` server running in the Kubernetes cluster.

       ```bash
       kubectl port-forward -n sysmgmt-health service/vmselect-vms 8082:8481
       ```

    1. Set up port forwarding from a laptop or workstation to access the service.

       Because the service has no cluster IP, forward to the local port opened by `kubectl port-forward` on the NCN rather than to a cluster IP. If the port is unprivileged, use the same port number on the local side.

       Replace the port and system name values in the example below.

       ```bash
       ssh -L 9090:127.0.0.1:8082 root@SYSTEM_NCN_DOMAIN_NAME
       ```

    1. Visit `http://localhost:9090/` in a laptop or workstation browser.
diff --git a/operations/system_management_health/Access_System_Management_Health_Services.md b/operations/system_management_health/Access_System_Management_Health_Services.md
index 9fa657cad684..32310acc1463 100644
--- a/operations/system_management_health/Access_System_Management_Health_Services.md
+++ b/operations/system_management_health/Access_System_Management_Health_Services.md
@@ -44,34 +44,33 @@ When accessing the URLs listed below, it will be necessary to accept one or more
 logging in. 
The details of the security warning will indicate that a self-signed certificate/unknown issuer is being used for the site. Support for incorporation of certificates from Trusted Certificate Authorities is planned for a future release.

-### Prometheus
+### VictoriaMetrics UI

-URL: `https://prometheus.cmn.SYSTEM_DOMAIN_NAME/`
+URL: `https://vmselect.cmn.SYSTEM_DOMAIN_NAME/select/0/prometheus/vmui`

-Central Prometheus instance scrapes metrics from Kubernetes, Ceph, and the hosts (part of `kube-prometheus-stack` Helm chart).
+The `vmagent` instance scrapes metrics from Kubernetes, Ceph, and the hosts (part of the `victoria-metrics-k8s-stack` Helm chart).

-Prometheus generates alerts based on metrics and reports them to the Alertmanager. The 'Alerts' link at the top of the page will show all of the inactive, pending, and firing alerts on the system.
+VictoriaMetrics generates alerts based on metrics and reports them to the Alertmanager. The 'Alerts' link at the top of the page will show all of the inactive, pending, and firing alerts on the system.

 Clicking on any of the alerts will expand them, enabling users to use the 'Labels' data to discern the details of the
 alert. The details will also show the state of the alert, how long it has been active, and the value for the alert.

-For more information regarding the use of the Prometheus interface, see
-[Getting Started](https://prometheus.io/docs/prometheus/latest/getting_started/) in the Prometheus online documentation.
+For more information regarding the use of the VictoriaMetrics interface, see
+[Getting Started](https://docs.victoriametrics.com/) in the VictoriaMetrics online documentation.

 Some alerts may be falsely triggered. This occurs if they are alerts which will be improved in the future, or if they
 are alerts impacted by whether all software products have been installed yet.
 See [Troubleshoot Prometheus Alerts](Troubleshoot_Prometheus_Alerts.md).

-### Thanos
+### VMAlert

-URL: `https://thanos.cmn.SYSTEM_DOMAIN_NAME/`
+URL: `https://vmselect.cmn.SYSTEM_DOMAIN_NAME/select/0/prometheus/vmalert/`

-Thanos is a set of components that can be composed into a highly available, multi Prometheus metric system with potentially unlimited storage capacity, if your Object Storage allows for it.
-It leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric data in any object storage while retaining fast query latencies.
-Additionally, it provides a global query view across all Prometheus installations and can merge data from Prometheus HA pairs.
+VMAlert executes a list of given alerting or recording rules against a configured datasource address.
+
+The VMAlert CRD declaratively defines a desired VMAlert setup to run in a Kubernetes cluster.
+It has a few required configuration options: `datasource` and `notifier`. For the other configuration parameters, see the VictoriaMetrics operator documentation.

-For more information regarding the use of the Thanos interface, see
-[Getting Started](https://thanos.io/tip/thanos/getting-started.md/) in the Thanos online documentation.
+For each VMAlert resource, the Operator deploys a properly configured Deployment in the same namespace. The VMAlert pods are configured to mount a list of `ConfigMaps`, named with a `-number` suffix, containing the configuration for alerting rules.
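
As a quick sanity check, the rule groups that VMAlert has loaded can be listed through its Prometheus-compatible HTTP API. This is a minimal sketch, assuming the `vmalert-vms` service in the `sysmgmt-health` namespace listening on port 8080:

```bash
# Forward a local port to the vmalert-vms service, then list the names of the
# alerting/recording rule groups that VMAlert has loaded.
kubectl -n sysmgmt-health port-forward service/vmalert-vms 8080:8080 &
sleep 2
curl -s http://127.0.0.1:8080/api/v1/rules | jq -r '.data.groups[].name'
```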
-### Alertmanager URL: `https://alertmanager.cmn.SYSTEM_DOMAIN_NAME/` diff --git a/operations/system_management_health/Configure_Prometheus_Alerta_Alert_Notifications.md b/operations/system_management_health/Configure_Prometheus_Alerta_Alert_Notifications.md deleted file mode 100644 index e5cb1f8874a7..000000000000 --- a/operations/system_management_health/Configure_Prometheus_Alerta_Alert_Notifications.md +++ /dev/null @@ -1,108 +0,0 @@ -# Configure Prometheus Alerta Alert Notifications - -Configure an Alerta alert notification for Prometheus Alertmanager alerts. - -## System domain name - -The `SYSTEM_DOMAIN_NAME` value found in some of the URLs on this page is expected to be the system's fully qualified domain name (FQDN). - -(`ncn-mw#`) The FQDN can be found by running the following command on any Kubernetes NCN. - -```bash -kubectl get secret site-init -n loftsman -o jsonpath='{.data.customizations\.yaml}' | base64 -d | yq r - spec.network.dns.external -``` - -Example output: - -```text -system.hpc.amslabs.hpecorp.net -``` - -Be sure to modify the example URLs on this page by replacing `SYSTEM_DOMAIN_NAME` with the actual value found using the above command. - -## Procedure - -This procedure can be performed on any master or worker NCN. - -1. (`ncn-mw#`) Save the current alert notification configuration, in case a rollback is needed. - - ```bash - kubectl get secret -n sysmgmt-health alertmanager-cray-sysmgmt-health-promet-alertmanager \ - -ojsonpath='{.data.alertmanager\.yaml}' | base64 --decode > /tmp/alertmanager-default.yaml - ``` - -1. (`ncn-mw#`) Create a secret and an alert configuration that will be used to add Alerta notifications for the alerts. - - 1. Create the secret file. - - Create a file named `/tmp/alertmanager-secret.yaml` with the following contents: - - ```yaml - apiVersion: v1 - data: - alertmanager.yaml: ALERTMANAGER_CONFIG - kind: Secret - metadata: - labels: - app: kube-prometheus-stack-alertmanager - chart: kube-prometheus-stack-45.1.1 - heritage: Tiller - release: cray-sysmgmt-health - name: alertmanager-cray-sysmgmt-health-kube-p-alertmanager - namespace: sysmgmt-health - type: Opaque - ``` - - 1. Create the Alerta alert configuration file. - - In the following example file, the Alerta server is used to send the notification to `http://sma-alerta.sma.svc.cluster.local:8080/webhooks/prometheus`. - Update the fields under `webhook_configs:` to reflect the desired configuration. - - Create a file named `/tmp/alertmanager-new.yaml` with the following contents: - - ```yaml - global: - resolve_timeout: 5m - route: - group_by: ['alertname'] - group_wait: 30s - group_interval: 5m - repeat_interval: 1h - receiver: 'web.hook' - receivers: - - name: 'web.hook' - webhook_configs: - - url: 'http://sma-alerta.sma.svc.cluster.local:8080/webhooks/prometheus' - inhibit_rules: - - source_match: - severity: 'critical' - target_match: - severity: 'warning' - equal: ['alertname', 'dev', 'instance'] - ``` - -1. (`ncn-mw#`) Replace the alert notification configuration based on the files created in the previous steps. - - ```bash - sed "s/ALERTMANAGER_CONFIG/$(cat /tmp/alertmanager-new.yaml \ - | base64 -w0)/g" /tmp/alertmanager-secret.yaml \ - | kubectl replace --force -f - - ``` - -1. (`ncn-mw#`) Validate the configuration changes. - - 1. View the current configuration. - - ```bash - kubectl exec alertmanager-cray-sysmgmt-health-promet-alertmanager-0 \ - -n sysmgmt-health -c alertmanager -- cat /etc/alertmanager/config/alertmanager.yaml - ``` - - 1. 
If the configuration does not look accurate, check the logs for errors.

      ```bash
      kubectl logs -f -n sysmgmt-health pod/alertmanager-cray-sysmgmt-health-promet-alertmanager-0 alertmanager
      ```

An Alerta notification will be sent once either of the alerts set in this procedure is `FIRING` in Prometheus.
See `https://prometheus.cmn.SYSTEM_DOMAIN_NAME/alerts` for more information.
diff --git a/operations/system_management_health/Configure_Prometheus_Email_Alert_Notifications.md b/operations/system_management_health/Configure_Prometheus_Email_Alert_Notifications.md
index 9dbff890e630..f786dc5a3107 100644
--- a/operations/system_management_health/Configure_Prometheus_Email_Alert_Notifications.md
+++ b/operations/system_management_health/Configure_Prometheus_Email_Alert_Notifications.md
@@ -62,10 +62,10 @@ This procedure can be performed on any master or worker NCN.

    ```yaml
    global:
-      resolve_timeout: 5m
+      resolve_timeout: 5h
    route:
      group_by:
-      - job
+      - group
      group_interval: 5m
      group_wait: 30s
      receiver: "null"
@@ -93,11 +93,13 @@ This procedure can be performed on any master or worker NCN.
        - to: receiver-email@yourcompany.com
          from: sender-email@gmail.com
          # Your smtp server address
+          require_tls: false
          smarthost: smtp.gmail.com:587
          auth_username: sender-email@gmail.com
          auth_identity: sender-email@gmail.com
          auth_password: xxxxxxxxxxxxxxxx
    ```
+
+   > **NOTE:** Set `require_tls: false` per receiver if TLS needs to be disabled.

 1. (`ncn-mw#`) Replace the alert notification configuration based on the files created in the previous steps.
diff --git a/operations/system_management_health/Grafterm.md b/operations/system_management_health/Grafterm.md
deleted file mode 100644
index f40dd61c62a3..000000000000
--- a/operations/system_management_health/Grafterm.md
+++ /dev/null
@@ -1,58 +0,0 @@
# Grafterm

Visualize metrics dashboards on the terminal, like a simplified and minimalist version of **Grafana** for the terminal.

The utility (script) can be found in the `/opt/cray/platform-utils` directory on all NCNs.

## Running options

Exit with `q` or `Esc`.

### Help

```bash
./grafterm.sh --help
```

### List available terminal dashboards

```bash
./grafterm.sh --list
```

### Default usage

To view a dashboard, pass the dashboard JSON file to the `-c` parameter of the script.
Grafterm will query for all data accessible in the datasource by default, and the dashboard refresh frequency is set to 10 seconds.

```bash
./grafterm.sh -c critical_services_dashboard.json
```

### Relative time

```bash
./grafterm.sh -c critical_services_dashboard.json -d 3h
```

### Refresh interval

```bash
./grafterm.sh -c critical_services_dashboard.json -r 10s
```

```bash
./grafterm.sh -c critical_services_dashboard.json -d 3h -r 10s
```

### Fixed time

Set a fixed time range to visualize the metrics using duration notation. In the following example, the start time is `now-23h` and the end time is `now-18h`.

```bash
./grafterm.sh -c critical_services_dashboard.json -s 23h -e 18h
```

Set a fixed time range to visualize the metrics using timestamp [ISO 8601] notation.
```bash
./grafterm.sh -c critical_services_dashboard.json -s 2021-10-30T11:25:10+05:00 -e 2021-10-30T11:55:10+05:00
```
diff --git a/operations/system_management_health/System_Management_Health.md b/operations/system_management_health/System_Management_Health.md
index bf488ae72c62..d50e48cb628b 100644
--- a/operations/system_management_health/System_Management_Health.md
+++ b/operations/system_management_health/System_Management_Health.md
@@ -13,7 +13,7 @@ include support for the Prometheus API. The System Management Health service rel
 Alertmanager instances, scrape metrics from service endpoints, and trigger alerts
 - Grafana supports pulling data from VictoriaMetrics, and dashboards for system components are readily available from
 the open source community
-- The `stable/kube-prometheus-stack` Helm chart integrates the Prometheus operator, Prometheus, Alertmanager, Grafana,
+- The `victoria-metrics-k8s-stack` Helm chart integrates the VictoriaMetrics operator, VictoriaMetrics, Alertmanager, Grafana,
 node exporters \(DaemonSet\), and `kube-state-metrics` to provide a monitoring solution for Kubernetes clusters
 - Istio supports service mesh observability using Kiali

diff --git a/operations/system_management_health/System_Management_Health_Checks_and_Alerts.md b/operations/system_management_health/System_Management_Health_Checks_and_Alerts.md
index ae891c6396c3..7caa87a06740 100644
--- a/operations/system_management_health/System_Management_Health_Checks_and_Alerts.md
+++ b/operations/system_management_health/System_Management_Health_Checks_and_Alerts.md
@@ -1,13 +1,13 @@
 # System Management Health Checks and Alerts

-A health check corresponds to a Prometheus query against metrics aggregated to the Prometheus instance. Core platform
+A health check corresponds to a VMAlert query against metrics aggregated in the VictoriaMetrics instance. Core platform
 components like Kubernetes and Istio collect service-related metrics by default, which enables the System Management
 Health service to implement generic service health checks without custom instrumentation. Health checks are intended to
 be coarse-grained and comprehensive, as opposed to fine-grained and exhaustive. Health checks related to infrastructure
 adhere to the Utilization Saturation Errors \(USE\) method whereas services follow the Rate Errors Duration \(RED\)
 method.

-Prometheus alerting rules periodically evaluate health checks and trigger alerts to Alertmanager, which manages
+VMAlert alerting rules periodically evaluate health checks and trigger alerts to Alertmanager, which manages
 silencing, inhibition, aggregation, and sending out notifications. Alertmanager supports a number of notification
 options, but the most relevant ones are listed below:

 - Slack - Publishes notifications to a Slack channel
 - Webhook - Sends an HTTP request to a configurable URL \(requires custom integration\)

-Similar to Prometheus metrics, alerts use labels to identify a particular dimensional instantiation, and the
+Similar to VictoriaMetrics metrics, alerts use labels to identify a particular dimensional instantiation, and the
 Alertmanager dashboard enables operators to preemptively silence alerts based on them.

 ## Check Active Alerts from NCNs

-Prometheus includes the `/api/v1/alerts` endpoint, which returns a JSON object containing the active alerts. From a
-non-compute node \(NCN\), users can connect to `sysmgmt-health/cray-sysmgmt-health-kube-p-prometheus` directly and bypass
-service authentication and authorization.
+VMAlert includes the `/api/v1/alerts` endpoint, which returns a JSON object containing the active alerts. 
+From a
+non-compute node \(NCN\), users can connect to the `vmalert-vms` service directly and bypass
+service authentication and authorization.

 Obtain the cluster IP address:

 ```bash
-kubectl -n sysmgmt-health get svc cray-sysmgmt-health-kube-p-prometheus
+kubectl -n sysmgmt-health get svc vmalert-vms
 ```

 Example output:

 ```text
-NAME                                    TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)    AGE
-cray-sysmgmt-health-kube-p-prometheus   ClusterIP   10.16.201.80    <none>        9090/TCP   2d6h
+NAME          TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)    AGE
+vmalert-vms   ClusterIP   10.21.216.162   <none>        8080/TCP   2d14h
 ```

-Get active alerts, which includes `KubeletTooManyPods` if it is going off:
+Get active alerts, which include `KubePersistentVolumeInodesFillingUp` if it is firing:

 ```bash
-curl -s http://CLUSTER-IP:PORT/api/v1/alerts | jq . | grep -B 10 -A 20 KubeletTooManyPods
+curl -s http://10.21.216.162:8080/api/v1/alerts | jq . | grep -B 10 -A 20 KubePersistentVolumeInodesFillingUp
 ```

 Example output:

 ```json
-{
-  "status": "success",
-  "data": {
-    "alerts": [
-      {
-        "labels": {
-          "alertname": "KubeletTooManyPods",
-          "endpoint": "https-metrics",
-          "instance": "10.252.1.6:10250",
-          "job": "kubelet",
-          "namespace": "kube-system",
-          "node": "ncn-w003",
-          "prometheus": "kube-monitoring/cray-prometheus-operator-prometheus",
-          "prometheus_replica": "prometheus-cray-prometheus-operator-prometheus-0",
-          "service": "cray-prometheus-operator-kubelet",
-          "severity": "warning"
-        },
-        "annotations": {
-          "message": "Kubelet 10.252.1.6:10250 is running 107 Pods, close to the limit of 110.",
-          "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods"
-        },
-        "state": "firing",
-        "activeAt": "2020-01-11T18:13:35.086499854Z",
-        "value": 107
-      },
-      {
-        "labels": {
+        "state": "firing",
+        "name": "KubePersistentVolumeInodesFillingUp",
+        "value": "0",
+        "labels": {
+          "alertgroup": "kubernetes-storage",
+          "alertname": "KubePersistentVolumeInodesFillingUp",
+          "beta_kubernetes_io_arch": "amd64",
+          "beta_kubernetes_io_os": "linux",
+          "cluster": "cluster-name",
+          "group": "prometheus",
+          "instance": "ncn-w010",
+          "job": "kubelet",
 ```

-In the example above, the alert actually indicates it is getting close to the limit, but the value included in the
-alert is the actual number of pods on `ncn-w003`.
+In the example above, the alert indicates that a persistent volume mounted on `ncn-w010` is running low on inodes;
+the `value` field is the current value of the alerting expression (here, the remaining fraction of free inodes).
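
To get an overview of everything that is currently active rather than a single alert, the same endpoint can be summarized with `jq`. A minimal sketch, assuming the `vmalert-vms` service and the `state`/`name` fields shown in the example output above:

```bash
# Count active alerts by state and name using the vmalert-vms cluster IP.
VMALERT_IP=$(kubectl -n sysmgmt-health get svc vmalert-vms -o jsonpath='{.spec.clusterIP}')
curl -s "http://${VMALERT_IP}:8080/api/v1/alerts" | jq -r '.data.alerts[] | "\(.state) \(.name)"' | sort | uniq -c
```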