From b87dd08ea7df01fc74d1f133ce150788fdb04845 Mon Sep 17 00:00:00 2001 From: fbalak Date: Thu, 29 Feb 2024 14:15:42 +0100 Subject: [PATCH 1/9] update message of StorageClientHeartbeatMissed alert Signed-off-by: fbalak --- .../monitoring/prometheus/alerts/test_provider_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py index d51c65832c7..55f31b6f6ea 100644 --- a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -44,8 +44,7 @@ def test_change_client_ocs_version_and_stop_heartbeat( target_alerts = [ { "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, - "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s). " - "Lossy network connectivity might exist", + "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s).", }, { "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, From 97f689563fff0d3c3a6290951843569f19a967f2 Mon Sep 17 00:00:00 2001 From: fbalak Date: Thu, 29 Feb 2024 14:25:37 +0100 Subject: [PATCH 2/9] remove dot from the alert message Signed-off-by: fbalak --- .../monitoring/prometheus/alerts/test_provider_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py index 55f31b6f6ea..017fa237aa8 100644 --- a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -44,7 +44,7 @@ def test_change_client_ocs_version_and_stop_heartbeat( target_alerts = [ { "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, - "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s).", + "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s)", }, { "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, From 83e77982926bd8ab41a13261b800ed5fe8cfb819 Mon Sep 17 00:00:00 2001 From: fbalak Date: Thu, 29 Feb 2024 15:12:09 +0100 Subject: [PATCH 3/9] update alert data Signed-off-by: fbalak --- .../prometheus/alerts/test_provider_client.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py index 017fa237aa8..40a5fd8c47f 100644 --- a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -45,11 +45,17 @@ def test_change_client_ocs_version_and_stop_heartbeat( { "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s)", + "severity": "warning", + }, + { + "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, + "msg": f"Storage Client ({client_name}) heartbeat missed for more than 300 (s)", + "severity": "error", }, { "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, - "msg": f"Storage Client Operator ({client_name}) differs by more " - "than 1 minor version. Client configuration may be incompatible and unsupported", + "msg": f"Storage Client Operator ({client_name}) differs by more than 1 minor version", + "severity": "error", }, ] states = ["firing"] @@ -60,14 +66,7 @@ def test_change_client_ocs_version_and_stop_heartbeat( msg=target_alert["msg"], alerts=alerts, states=states, - severity="error", - ) - prometheus.check_alert_list( - label=target_alert["label"], - msg=target_alert["msg"], - alerts=alerts, - states=states, - severity="warning", + severity=target_alert["severity"], ) api.check_alert_cleared( label=target_alert["label"], From 8e2f8040dc7d6ab0c7c76b7b7bb28152c30e8954 Mon Sep 17 00:00:00 2001 From: fbalak Date: Thu, 29 Feb 2024 15:20:57 +0100 Subject: [PATCH 4/9] increase alert collecting time Signed-off-by: fbalak --- tests/functional/monitoring/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/monitoring/conftest.py b/tests/functional/monitoring/conftest.py index eebded7b77c..62877c4f9ec 100644 --- a/tests/functional/monitoring/conftest.py +++ b/tests/functional/monitoring/conftest.py @@ -1177,7 +1177,7 @@ def change_client_version(): nonlocal client nonlocal original_cluster # run_time of operation - run_time = 60 * 3 + run_time = 60 * 7 client.stop_heartbeat() client.set_ocs_version("4.13.0") logger.info(f"Waiting for {run_time} seconds") From fad79707cabfca06c3ea7375fa520f1589e7ea1a Mon Sep 17 00:00:00 2001 From: fbalak Date: Wed, 15 May 2024 13:44:30 +0200 Subject: [PATCH 5/9] update alert messages Signed-off-by: fbalak --- .../prometheus/alerts/test_provider_client.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py index 40a5fd8c47f..81310f9e59f 100644 --- a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -1,6 +1,7 @@ import logging import pytest +from ocs_ci.framework import config from ocs_ci.framework.pytest_customization.marks import blue_squad from ocs_ci.framework.testlib import ( tier4c, @@ -41,20 +42,31 @@ def test_change_client_ocs_version_and_stop_heartbeat( client_name = measure_change_client_ocs_version_and_stop_heartbeat.get( "metadata" ).get("client_name") + cluster_namespace = config.ENV_DATA["cluster_namespace"] + cluster_name = config.ENV_DATA["storage_cluster_name"] target_alerts = [ { "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, - "msg": f"Storage Client ({client_name}) heartbeat missed for more than 120 (s)", + "msg": ( + f"Storage Client ({client_name}) heartbeat missed for more than 120 (s) " + f"in namespace:cluster {cluster_namespace}:{cluster_name}." + ), "severity": "warning", }, { "label": constants.ALERT_STORAGECLIENTHEARTBEATMISSED, - "msg": f"Storage Client ({client_name}) heartbeat missed for more than 300 (s)", + "msg": ( + f"Storage Client ({client_name}) heartbeat missed for more than 300 (s) " + f"in namespace:cluster {cluster_namespace}:{cluster_name}." + ), "severity": "error", }, { "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, - "msg": f"Storage Client Operator ({client_name}) differs by more than 1 minor version", + "msg": ( + f"Storage Client Operator ({client_name}) differs by more than 1 minor " + f"version in namespace:cluster {cluster_namespace}:{cluster_name}." + ), "severity": "error", }, ] From 74210c9340bdf07cf3d7453911564e9b7ca9de7c Mon Sep 17 00:00:00 2001 From: fbalak Date: Wed, 15 May 2024 15:24:24 +0200 Subject: [PATCH 6/9] update check_alert_list to reflect multiple messages for one alert Signed-off-by: fbalak --- ocs_ci/utility/prometheus.py | 47 ++++++++++++++---------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/ocs_ci/utility/prometheus.py b/ocs_ci/utility/prometheus.py index bd5c012eed0..c6a5a4f7f53 100644 --- a/ocs_ci/utility/prometheus.py +++ b/ocs_ci/utility/prometheus.py @@ -41,38 +41,27 @@ def check_alert_list( target_alerts = [ alert for alert in alerts if alert.get("labels").get("alertname") == label ] - logger.info(f"Checking properties of found {label} alerts") - if ignore_more_occurences: - for state in states: - delete = False - for key, alert in reversed(list(enumerate(target_alerts))): - if alert.get("state") == state: - if delete: - d_msg = f"Ignoring {alert} as alert already appeared." - logger.debug(d_msg) - target_alerts.pop(key) - else: - delete = True - assert_msg = ( - f"Incorrect number of {label} alerts ({len(target_alerts)} " - f"instead of {len(states)} with states: {states})." - f"\nAlerts: {target_alerts}" - ) - assert len(target_alerts) == len(states), assert_msg for key, state in enumerate(states): - - assert_msg = "Alert message for alert {label} is not correct" - assert target_alerts[key]["annotations"]["message"] == msg, assert_msg - - assert_msg = f"Alert {label} doesn't have {severity} severity" - assert ( - target_alerts[key]["annotations"]["severity_level"] == severity - ), assert_msg - - assert_msg = f"Alert {label} is not in {state} state" - assert target_alerts[key]["state"] == state, assert_msg + target_alerts = [ + alert + for alert in target_alerts + if alert["message"] == msg + and alert["severity_level"] == severity + and alert["state"] == state + ] + assert_msg = ( + f"There was not found alert {label} with message: {msg}, " + f"severity: {severity} in state: {state}" + ) + assert target_alerts, assert_msg + if not ignore_more_occurences: + assert_msg = ( + f"There are multiple instances of alert {label} with " + f"message: {msg}, severity: {severity} in state: {state}" + ) + assert len(target_alerts) == 1, assert_msg logger.info("Alerts were triggered correctly during utilization") From a7ef10bc672c793d5942473f194f25f450c82fc7 Mon Sep 17 00:00:00 2001 From: fbalak Date: Wed, 22 May 2024 14:41:13 +0200 Subject: [PATCH 7/9] specify namespace in patch command Signed-off-by: fbalak --- ocs_ci/ocs/resources/storageconsumer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocs_ci/ocs/resources/storageconsumer.py b/ocs_ci/ocs/resources/storageconsumer.py index 5d63f7f55a9..94fd8ea0b00 100644 --- a/ocs_ci/ocs/resources/storageconsumer.py +++ b/ocs_ci/ocs/resources/storageconsumer.py @@ -77,6 +77,8 @@ def set_ocs_version(self, version): + "'", "--subresource", "status", + "--namespace", + config.cluster_ctx.ENV_DATA["cluster_namespace"], ] exec_cmd(" ".join(cmd)) From a77d3966e5753cea00a2b7cfc67366119063e3cd Mon Sep 17 00:00:00 2001 From: fbalak Date: Wed, 22 May 2024 15:36:10 +0200 Subject: [PATCH 8/9] fix alert dictionary keys Signed-off-by: fbalak --- ocs_ci/utility/prometheus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocs_ci/utility/prometheus.py b/ocs_ci/utility/prometheus.py index c6a5a4f7f53..a2c982f55b0 100644 --- a/ocs_ci/utility/prometheus.py +++ b/ocs_ci/utility/prometheus.py @@ -47,8 +47,8 @@ def check_alert_list( target_alerts = [ alert for alert in target_alerts - if alert["message"] == msg - and alert["severity_level"] == severity + if alert["annotations"]["message"] == msg + and alert["annotations"]["severity_level"] == severity and alert["state"] == state ] assert_msg = ( From 520a562310f31a9a09d9fec92b120ce276373154 Mon Sep 17 00:00:00 2001 From: fbalak Date: Thu, 23 May 2024 10:03:06 +0200 Subject: [PATCH 9/9] fix severity level Signed-off-by: fbalak --- .../monitoring/prometheus/alerts/test_provider_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py index 81310f9e59f..9a6c42b4fff 100644 --- a/tests/functional/monitoring/prometheus/alerts/test_provider_client.py +++ b/tests/functional/monitoring/prometheus/alerts/test_provider_client.py @@ -59,7 +59,7 @@ def test_change_client_ocs_version_and_stop_heartbeat( f"Storage Client ({client_name}) heartbeat missed for more than 300 (s) " f"in namespace:cluster {cluster_namespace}:{cluster_name}." ), - "severity": "error", + "severity": "critical", }, { "label": constants.ALERT_STORAGECLIENTINCOMPATIBLEOPERATORVERSION, @@ -67,7 +67,7 @@ def test_change_client_ocs_version_and_stop_heartbeat( f"Storage Client Operator ({client_name}) differs by more than 1 minor " f"version in namespace:cluster {cluster_namespace}:{cluster_name}." ), - "severity": "error", + "severity": "critical", }, ] states = ["firing"]