Elastic search krkn-lib integration (#658)
* Elastic search krkn-lib integration

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

removed default urls

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* Fix alerts bug on prometheus

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* fixed prometheus object initialization bug

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* updated requirements to krkn-lib 2.1.8

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* disabled alerts and metrics by default

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* reverted requirement to elastic branch on krkn-lib

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* numpy downgrade

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* maximum retries added to hijacking funtest

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* added elastic settings to funtest config

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* krkn-lib 3.0.0 update

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

---------

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
tsebastiani authored Aug 28, 2024
1 parent 9cd086f commit 6186555
Showing 6 changed files with 205 additions and 19 deletions.
12 changes: 12 additions & 0 deletions CI/config/common_test_config.yaml
@@ -50,3 +50,15 @@ telemetry:
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
events_backup: True # enables/disables cluster events collection
telemetry_group: "funtests"
elastic:
enable_elastic: True
collect_metrics: False
collect_alerts: False
verify_certs: False
elastic_url: "https://192.168.39.196" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_port: 32766
username: "elastic"
password: "test"
metrics_index: "krkn-metrics"
alerts_index: "krkn-alerts"
telemetry_index: "krkn-telemetry"
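For reference, a minimal sketch of loading this section outside the test harness, assuming the file path shown in the diff header above; the defaults passed to .get() are illustrative only and are not taken from krkn itself.

import yaml

# Load the funtest configuration and pull out the new elastic section.
with open("CI/config/common_test_config.yaml") as config_file:
    config = yaml.safe_load(config_file)

elastic = config.get("elastic", {})
enable_elastic = elastic.get("enable_elastic", False)  # True in the funtest config above
elastic_url = elastic.get("elastic_url", "")            # e.g. https://192.168.39.196
elastic_port = elastic.get("elastic_port", 32766)
print(enable_elastic, elastic_url, elastic_port)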
9 changes: 8 additions & 1 deletion CI/tests/test_service_hijacking.sh
@@ -42,7 +42,14 @@ function functional_test_service_hijacking {
python3 -m coverage run -a run_kraken.py -c CI/config/service_hijacking.yaml > /dev/null 2>&1 &
PID=$!
#Waiting for the hijacking to take effect
while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]; do echo "waiting scenario to kick in."; sleep 1; done;
COUNTER=0
while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]
do
echo "waiting scenario to kick in."
sleep 1
COUNTER=$((COUNTER+1))
[ $COUNTER -eq 100 ] && echo "maximum number of retries reached, test failed" && exit 1
done

#Checking Step 1 GET on /list/index.php
OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
21 changes: 17 additions & 4 deletions config/config.yaml
@@ -55,12 +55,27 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_url: '' # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
enable_metrics: False
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
metrics_profile: config/metrics.yaml
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
elastic:
enable_elastic: False
collect_metrics: False
collect_alerts: False
verify_certs: False
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_port: 32766
username: "elastic"
password: "test"
metrics_index: "krkn-metrics"
alerts_index: "krkn-alerts"
telemetry_index: "krkn-telemetry"
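A minimal sketch of how these options feed the krkn-lib Elasticsearch client, mirroring the KrknElastic constructor call added to run_kraken.py later in this commit; the SafeLogger import path and its no-argument construction are assumptions for illustration.

import yaml
from krkn_lib.elastic.krkn_elastic import KrknElastic
from krkn_lib.utils.safe_logger import SafeLogger  # assumed import path

with open("config/config.yaml") as config_file:
    elastic_cfg = yaml.safe_load(config_file)["elastic"]

# Argument order follows the constructor call added to run_kraken.py in this commit.
elastic_search = KrknElastic(
    SafeLogger(),                    # assumed to default to stdout logging
    elastic_cfg["elastic_url"],
    elastic_cfg["elastic_port"],
    elastic_cfg["verify_certs"],
    elastic_cfg["username"],
    elastic_cfg["password"],
)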

tunings:
wait_duration: 60 # Duration to wait between each chaos scenario
iterations: 1 # Number of times to execute the scenarios
@@ -94,9 +109,7 @@ telemetry:
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
events_backup: True # enables/disables cluster events collection
elastic:
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_index: "" # Elastic search index pattern to post results to




87 changes: 83 additions & 4 deletions kraken/prometheus/client.py
@@ -1,16 +1,30 @@
from __future__ import annotations

import datetime
import os.path
from typing import Optional
from typing import Optional, List, Dict, Any

import urllib3
import logging
import sys

import yaml
from krkn_lib.elastic.krkn_elastic import KrknElastic
from krkn_lib.models.elastic.models import ElasticAlert
from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
def alerts(prom_cli: KrknPrometheus,
elastic: KrknElastic,
run_uuid,
start_time,
end_time,
alert_profile,
elastic_collect_alerts,
elastic_alerts_index
):

if alert_profile is None or os.path.exists(alert_profile) is False:
logging.error(f"{alert_profile} alert profile does not exist")
@@ -20,17 +34,28 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
profile_yaml = yaml.safe_load(profile)
if not isinstance(profile_yaml, list):
logging.error(f"{alert_profile} wrong file format, alert profile must be "
f"a valid yaml file containing a list of items with 3 properties: "
f"a valid yaml file containing a list of items with at least 3 properties: "
f"expr, description, severity" )
sys.exit(1)

for alert in profile_yaml:
if sorted(alert.keys()) != sorted(["expr", "description", "severity"]):
logging.error(f"wrong alert {alert}, skipping")

prom_cli.process_alert(alert,
processed_alert = prom_cli.process_alert(alert,
datetime.datetime.fromtimestamp(start_time),
datetime.datetime.fromtimestamp(end_time))
if processed_alert[0] and processed_alert[1] and elastic_collect_alerts:
elastic_alert = ElasticAlert(run_uuid=run_uuid,
severity=alert["severity"],
alert=processed_alert[1],
created_at=datetime.datetime.fromtimestamp(processed_alert[0])
)
result = elastic.push_alert(elastic_alert, elastic_alerts_index)
if result == -1:
logging.error("failed to save alert on ElasticSearch")
pass
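To make the new push path concrete, a hedged sketch of the ElasticAlert hand-off in isolation: the (timestamp, description) tuple shape is inferred from how processed_alert is indexed above, the sample values are hypothetical, and the push_alert call is commented out because it needs a live Elasticsearch instance.

import datetime
from krkn_lib.models.elastic.models import ElasticAlert

# Hypothetical result of prom_cli.process_alert(...): (timestamp, rendered description).
processed_alert = (1724839200, "99th etcd fsync latency on etcd-0 higher than 10ms")

elastic_alert = ElasticAlert(
    run_uuid="00000000-0000-0000-0000-000000000000",  # hypothetical run id
    severity="warning",
    alert=processed_alert[1],
    created_at=datetime.datetime.fromtimestamp(processed_alert[0]),
)
# elastic.push_alert(elastic_alert, "krkn-alerts")  # returns -1 on failure, per the handling above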



def critical_alerts(prom_cli: KrknPrometheus,
@@ -86,3 +111,57 @@

if not firing_alerts:
logging.info("No critical alerts are firing!!")


def metrics(prom_cli: KrknPrometheus,
elastic: KrknElastic,
run_uuid,
start_time,
end_time,
metrics_profile,
elastic_collect_metrics,
elastic_metrics_index
) -> list[dict[str, str | list[tuple[int, float]]]]:
metrics_list: list[dict[str, str | list[tuple[int, float]]]] = []
if metrics_profile is None or os.path.exists(metrics_profile) is False:
logging.error(f"{metrics_profile} alert profile does not exist")
sys.exit(1)
with open(metrics_profile) as profile:
profile_yaml = yaml.safe_load(profile)
if not profile_yaml["metrics"] or not isinstance(profile_yaml["metrics"], list):
logging.error(f"{metrics_profile} wrong file format, alert profile must be "
f"a valid yaml file containing a list of items with 3 properties: "
f"expr, description, severity" )
sys.exit(1)

for metric_query in profile_yaml["metrics"]:
if sorted(metric_query.keys()) != sorted(["query", "metricName", "instant"]):
logging.error(f"wrong metric {metric_query}, skipping")
metrics_result = prom_cli.process_prom_query_in_range(
metric_query["query"],
start_time=datetime.datetime.fromtimestamp(start_time),
end_time=datetime.datetime.fromtimestamp(end_time)

)

metric = {"name": metric_query["metricName"], "values":[]}
for returned_metric in metrics_result:
if "values" in returned_metric:
for value in returned_metric["values"]:
try:
metric["values"].append((value[0], float(value[1])))
except ValueError:
pass
metrics_list.append(metric)

if elastic_collect_metrics:
result = elastic.upload_metrics_to_elasticsearch(run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list)
if result == -1:
logging.error("failed to save metrics on ElasticSearch")


return metrics_list
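Per the loop above, the return value is a list of {"name": ..., "values": [(timestamp, float), ...]} dictionaries, which upload_metrics_to_elasticsearch receives as raw_data; a short sketch of consuming that structure, with purely illustrative sample data.

# Illustrative shape of the metrics() return value built above.
metrics_list = [
    {"name": "nodeCPU-Masters", "values": [(1724839200, 12.5), (1724839260, 13.1)]},
]

for metric in metrics_list:
    samples = [value for _, value in metric["values"]]
    if samples:
        print(f'{metric["name"]}: {len(samples)} samples, max={max(samples):.2f}')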




3 changes: 2 additions & 1 deletion requirements.txt
@@ -15,9 +15,10 @@ google-api-python-client==2.116.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.4
krkn-lib==2.1.9
krkn-lib==3.0.0
lxml==5.1.0
kubernetes==28.1.0
numpy==1.26.4
oauth2client==4.1.3
pandas==2.2.0
openshift-client==1.0.21
92 changes: 83 additions & 9 deletions run_kraken.py
@@ -10,6 +10,8 @@
import uuid
import time

from krkn_lib.elastic.krkn_elastic import KrknElastic
from krkn_lib.models.elastic import ElasticChaosRunTelemetry
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
import kraken.time_actions.common_time_functions as time_actions
@@ -30,7 +32,6 @@
from kraken import plugins, syn_flood
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.ocp import KrknOpenshift
from krkn_lib.telemetry.elastic import KrknElastic
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.models.telemetry import ChaosRunTelemetry
@@ -94,14 +95,61 @@ def main(cfg) -> int:
enable_alerts = get_yaml_item_value(
config["performance_monitoring"], "enable_alerts", False
)
enable_metrics = get_yaml_item_value(
config["performance_monitoring"], "enable_metrics", False
)
# elastic search
enable_elastic = get_yaml_item_value(
config["elastic"], "enable_elastic", False
)
elastic_collect_metrics = get_yaml_item_value(
config["elastic"], "collect_metrics", False
)

elastic_collect_alerts = get_yaml_item_value(
config["elastic"], "collect_alerts", False
)

elastic_url = get_yaml_item_value(
config["elastic"], "elastic_url", ""
)

elastic_verify_certs = get_yaml_item_value(
config["elastic"], "verify_certs", False
)

elastic_port = get_yaml_item_value(
config["elastic"], "elastic_port", 32766
)

elastic_username = get_yaml_item_value(
config["elastic"], "username", ""
)
elastic_password = get_yaml_item_value(
config["elastic"], "password", ""
)

elastic_metrics_index = get_yaml_item_value(
config["elastic"], "metrics_index", "krkn-metrics"
)

elastic_alerts_index = get_yaml_item_value(
config["elastic"], "alerts_index", "krkn-alerts"
)

elastic_telemetry_index = get_yaml_item_value(
config["elastic"], "telemetry_index", "krkn-telemetry"
)



alert_profile = config["performance_monitoring"].get("alert_profile")
metrics_profile = config["performance_monitoring"].get("metrics_profile")
check_critical_alerts = get_yaml_item_value(
config["performance_monitoring"], "check_critical_alerts", False
)
telemetry_api_url = config["telemetry"].get("api_url")
elastic_config = get_yaml_item_value(config,"elastic",{})
elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")


# Initialize clients
if (not os.path.isfile(kubeconfig_path) and
@@ -167,7 +215,7 @@ def main(cfg) -> int:
cv = ""
if distribution == "openshift":
cv = ocpcli.get_clusterversion_string()
if prometheus_url is None:
if not prometheus_url:
try:
connection_data = ocpcli.get_prometheus_api_connection_data()
if connection_data:
@@ -189,9 +237,16 @@ def main(cfg) -> int:
# KrknTelemetry init
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
telemetry_elastic = KrknElastic(safe_logger,elastic_url)
if enable_elastic:
elastic_search = KrknElastic(safe_logger,
elastic_url,
elastic_port,
elastic_verify_certs,
elastic_username,
elastic_password
)
summary = ChaosRunAlertSummary()
if enable_alerts or check_critical_alerts:
if enable_metrics or enable_alerts or check_critical_alerts:
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)

logging.info("Server URL: %s" % kubecli.get_host())
@@ -400,7 +455,12 @@ def main(cfg) -> int:
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
chaos_output.telemetry = decoded_chaos_run_telemetry
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
if enable_elastic:
elastic_telemetry = ElasticChaosRunTelemetry(chaos_run_telemetry=decoded_chaos_run_telemetry)
result = elastic_search.push_telemetry(elastic_telemetry, elastic_telemetry_index)
if result == -1:
safe_logger.error(f"failed to save telemetry on elastic search: {chaos_output.to_json()}")

if config["telemetry"]["enabled"]:
logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
@@ -451,14 +511,28 @@ def main(cfg) -> int:
if alert_profile:
prometheus_plugin.alerts(
prometheus,
elastic_search,
run_uuid,
start_time,
end_time,
alert_profile,
elastic_collect_alerts,
elastic_alerts_index
)

else:
logging.error("Alert profile is not defined")
#sys.exit(1)
return 1
#sys.exit(1)
if enable_metrics:
prometheus_plugin.metrics(prometheus,
elastic_search,
run_uuid,
start_time,
end_time,
metrics_profile,
elastic_collect_metrics,
elastic_metrics_index)

if post_critical_alerts > 0:
logging.error("Critical alerts are firing, please check; exiting")
