From dd0272414969cbcac7fcc73cfd1853a6a3cd0498 Mon Sep 17 00:00:00 2001 From: Shea Stewart Date: Mon, 26 Feb 2024 16:16:09 -0500 Subject: [PATCH] Updates/rs and owner (#340) * replica health script * integrate task * fix runwhen flag * add util to find related resources for certain label / annotation patterns * update default error codes * switch to a script for future improvements * add additional related objects * simplify issue titles --- .../runbook.robot | 24 ++- .../check_replicaset.sh | 146 ++++++++++++++++++ .../event_anomalies.sh | 51 ++++++ .../k8s-deployment-healthcheck/runbook.robot | 69 ++++++++- .../k8s-jaeger-http-query/runbook.robot | 2 +- libraries/RW/K8sHelper/__init__.py | 1 + libraries/RW/K8sHelper/k8s_helper.py | 40 +++++ 7 files changed, 309 insertions(+), 24 deletions(-) create mode 100755 codebundles/k8s-deployment-healthcheck/check_replicaset.sh create mode 100755 codebundles/k8s-deployment-healthcheck/event_anomalies.sh create mode 100644 libraries/RW/K8sHelper/__init__.py create mode 100644 libraries/RW/K8sHelper/k8s_helper.py diff --git a/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot b/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot index 5f042cb6..28ff08b3 100644 --- a/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot +++ b/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot @@ -8,6 +8,7 @@ Metadata Supports GCP,GMP,Ingress,Nginx,Metrics Library BuiltIn Library RW.Core Library RW.CLI +Library RW.K8sHelper Library RW.platform Library OperatingSystem @@ -42,6 +43,12 @@ Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}` ${owner_name}= RW.CLI.Run Cli ... cmd=echo "${k8s_ingress_details.stdout}" | grep 'Owner:[^ ]*' | awk -F': ' '{print $2}' |awk -F':' '{print $2}'| sed 's/ *$//' | tr -d '\n' ... include_in_history=false + ${k8s_ingress_details}= RW.CLI.Run Cli + ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ingress ${INGRESS_OBJECT_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ${related_resource_recommendations}= RW.K8sHelper.Get Related Resource Recommendations + ... k8s_object=${k8s_ingress_details.stdout} RW.CLI.Parse Cli Output By Line ... rsp=${gmp_rsp} ... set_severity_level=2 @@ -49,7 +56,7 @@ Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}` ... set_issue_actual=We found the following HTTP error codes: ${ERROR_CODES} associated with the ingress in $_line ... set_issue_title=Detected HTTP Error Codes for Ingress `${INGRESS_OBJECT_NAME}` ... set_issue_details=HTTP error codes in ingress and service "$_line". Troubleshoot the application associated with ${owner_kind.stdout} `${owner_name.stdout}` - ... set_issue_next_steps=Check Deployment Log For Issues with `${owner_name.stdout}`\nQuery Traces for HTTP Errors in Namespace `${NAMESPACE}` + ... set_issue_next_steps=Check Deployment Log For Issues with `${owner_name.stdout}`\nQuery Traces for HTTP Errors in Namespace `${NAMESPACE}`\n${related_resource_recommendations} ... _line__raise_issue_if_contains=Host ${ingress_info}= Set Variable ${gmp_rsp.stdout} IF """${ingress_info}""" == "" or """${ingress_info}""".isspace() @@ -75,7 +82,6 @@ Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}` RW.Core.Add Pre To Report Commands Used: ${history} RW.Core.Add Pre To Report Ingress Info:\n${k8s_ingress_details.stdout} - *** Keywords *** Suite Initialization ${kubeconfig}= RW.Core.Import Secret @@ -84,10 +90,6 @@ Suite Initialization ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). ... pattern=\w* ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ - ${kubectl}= RW.Core.Import Service kubectl - ... 
description=The location service used to interpret shell commands. - ... default=kubectl-service.shared - ... example=kubectl-service.shared ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY ... type=string ... description=Which binary to use for Kubernetes CLI commands. @@ -105,12 +107,6 @@ Suite Initialization ... pattern=\w* ... example=otel-demo ... default= - ${GCLOUD_SERVICE}= RW.Core.Import Service gcloud - ... type=string - ... description=The selected RunWhen Service to use for accessing services within a network. - ... pattern=\w* - ... example=gcloud-service.shared - ... default=gcloud-service.shared ${gcp_credentials_json}= RW.Core.Import Secret gcp_credentials_json ... type=string ... description=GCP service account json used to authenticate with GCP APIs. @@ -151,15 +147,13 @@ Suite Initialization ... description=Which http status codes to look for and classify as errors. ... pattern=\w* ... example=500 - ... default=500|501|502 + ... 
default=500|501|502|503|504 ${OS_PATH}= Get Environment Variable PATH Set Suite Variable ${kubeconfig} ${kubeconfig} - Set Suite Variable ${kubectl} ${kubectl} Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} Set Suite Variable ${CONTEXT} ${CONTEXT} Set Suite Variable ${NAMESPACE} ${NAMESPACE} Set Suite Variable ${ERROR_CODES} ${ERROR_CODES} - Set Suite Variable ${GCLOUD_SERVICE} ${GCLOUD_SERVICE} Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json} Set Suite Variable ${GCP_PROJECT_ID} ${GCP_PROJECT_ID} Set Suite Variable ${INGRESS_HOST} ${INGRESS_HOST} diff --git a/codebundles/k8s-deployment-healthcheck/check_replicaset.sh b/codebundles/k8s-deployment-healthcheck/check_replicaset.sh new file mode 100755 index 00000000..7d9a38a4 --- /dev/null +++ b/codebundles/k8s-deployment-healthcheck/check_replicaset.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# Kubernetes Deployment ReplicaSet Management Script +# This script checks Kubernetes deployments to ensure they are running the latest ReplicaSet. It is designed to manage +# ReplicaSets during normal operations and rolling updates, checking for multiple ReplicaSets, verifying the active latest ReplicaSet, and providing actionable insights for any inactive or conflicting ReplicaSets. 
+ +# Function to check for rolling update status +check_rolling_update_status() { + # Extract conditions and replica counts + local progressingCondition=$(echo "$DEPLOYMENT_JSON" | jq '.status.conditions[] | select(.type=="Progressing")') + local availableCondition=$(echo "$DEPLOYMENT_JSON" | jq '.status.conditions[] | select(.type=="Available").status') + local replicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.replicas // 0') + local updatedReplicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.updatedReplicas // 0') + local availableReplicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.availableReplicas // 0') + local readyReplicas=$(echo "$DEPLOYMENT_JSON" | jq '.status.readyReplicas // 0') + + # Interpret 'Progressing' condition more accurately + local progressingStatus=$(echo "$progressingCondition" | jq -r '.status') + local progressingReason=$(echo "$progressingCondition" | jq -r '.reason') + local lastUpdateTime=$(echo "$progressingCondition" | jq -r '.lastUpdateTime') + + # Current time in UTC for comparison (assuming 'date' command is available and system timezone is correctly set) + local currentTime=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + # Compare replica counts for a more accurate ongoing rollout check + if [[ "$progressingStatus" == "True" && "$progressingReason" == "NewReplicaSetAvailable" && "$updatedReplicas" == "$replicas" && "$availableReplicas" == "$updatedReplicas" && "$readyReplicas" == "$updatedReplicas" ]]; then + # Check how recent the last update was to consider a buffer for stabilization + if [[ $(date -d "$lastUpdateTime" +%s) -lt $(date -d "$currentTime" +%s --date='-2 minutes') ]]; then + echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected." + ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress + else + echo "Deployment $DEPLOYMENT_NAME has recently updated and may still be stabilizing." 
+ ROLLING_UPDATE_STATUS=0 # Indicates recent update, considering stabilization + fi + elif [[ "$updatedReplicas" -lt "$replicas" ]] || [[ "$availableReplicas" -lt "$updatedReplicas" ]] || [[ "$readyReplicas" -lt "$updatedReplicas" ]]; then + echo "Deployment $DEPLOYMENT_NAME is undergoing a rollout." + ROLLING_UPDATE_STATUS=0 # Indicates an update is in progress + else + echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected." + ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress + fi +} + + + +verify_pods_association_with_latest_rs() { + # Fetch all pods associated with the deployment + PODS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get pods -n $NAMESPACE --context $CONTEXT --selector=app=$DEPLOYMENT_NAME --context $CONTEXT -o json) + PODS_COUNT=$(echo "$PODS_JSON" | jq '.items | length') + OUTDATED_PODS_COUNT=0 + + for ((i=0; i 0 FOR ${item} IN @{object_list} @@ -163,7 +170,7 @@ Troubleshoot Deployment Warning Events for `${DEPLOYMENT_NAME}` ... title= Deployment `${DEPLOYMENT_NAME}` generated warning events for ${item["kind"]} `${item["name"]}`. ... reproduce_hint=View Commands Used in Report Output ... details=${item["kind"]} `${item["name"]}` generated the following warning details:\n`${item}` - ... next_steps=${item_next_steps.stdout} + ... next_steps=${item_next_steps.stdout}\n${related_resource_recommendations} END END ${history}= RW.CLI.Pop Shell History @@ -216,7 +223,7 @@ Troubleshoot Deployment Replicas for `${DEPLOYMENT_NAME}` ... severity=1 ... expected=Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` should have minimum availability / pod. ... actual=Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` does not have minimum availability / pods. - ... title= Deployment `${DEPLOYMENT_NAME}` has status: ${deployment_status["available_condition"]["message"]} + ... title= Deployment `${DEPLOYMENT_NAME}` is unavailable. Status: `${deployment_status["available_condition"]["message"]}` ... 
reproduce_hint=View Commands Used in Report Output ... details=Deployment `${DEPLOYMENT_NAME}` has ${deployment_status["ready_replicas"]} pods and needs ${deployment_status["desired_replicas"]}:\n`${deployment_status}` ... next_steps=${item_next_steps.stdout} @@ -225,7 +232,7 @@ Troubleshoot Deployment Replicas for `${DEPLOYMENT_NAME}` ... severity=3 ... expected=Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` should have ${deployment_status["desired_replicas"]} pods. ... actual=Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` has ${deployment_status["ready_replicas"]} pods. - ... title= Deployment `${DEPLOYMENT_NAME}` has ${deployment_status["unavailable_replicas"]} unavailable pods. + ... title= Deployment `${DEPLOYMENT_NAME}` has ${deployment_status["unavailable_replicas"]} pods that are not running. ... reproduce_hint=View Commands Used in Report Output ... details=Deployment `${DEPLOYMENT_NAME}` has minimum availability, but has unready pods:\n`${deployment_status}` ... next_steps=Troubleshoot Deployment Warning Events for `${DEPLOYMENT_NAME}` @@ -256,12 +263,18 @@ Check Deployment Event Anomalies for `${DEPLOYMENT_NAME}` ... occurences ... connection error ... ${DEPLOYMENT_NAME} - ${recent_anomalies}= RW.CLI.Run Cli - ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '(now - (60*60)) as $time_limit | [ .items[] | select(.type != "Warning" and (.involvedObject.kind == "Deployment" or .involvedObject.kind == "ReplicaSet" or .involvedObject.kind == "Pod") and (.involvedObject.name | tostring | contains("${DEPLOYMENT_NAME}"))) | {kind: .involvedObject.kind, count: .count, name: .involvedObject.name, reason: .reason, message: .message, firstTimestamp: .firstTimestamp, lastTimestamp: .lastTimestamp, duration: (if (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601)) == 0) then 1 else (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601))/60) end) } ] | group_by([.kind, .name]) | map({kind: .[0].kind, name: .[0].name, count: (map(.count) | add), reasons: map(.reason) | unique, messages: map(.message) | unique, average_events_per_minute: (if .[0].duration == 1 then 1 else ((map(.count) | add)/.[0].duration ) end),firstTimestamp: map(.firstTimestamp | fromdateiso8601) | sort | .[0] | todateiso8601, lastTimestamp: map(.lastTimestamp | fromdateiso8601) | sort | reverse | .[0] | todateiso8601})' + ${recent_anomalies}= RW.CLI.Run Bash File + ... bash_file=event_anomalies.sh ... env=${env} ... secret_file__kubeconfig=${kubeconfig} + ... include_in_history=false ... show_in_rwl_cheatsheet=true - ... render_in_commandlist=true + ${k8s_deployment_details}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment ${DEPLOYMENT_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ${related_resource_recommendations}= RW.K8sHelper.Get Related Resource Recommendations + ... k8s_object=${k8s_deployment_details.stdout} ${anomaly_list}= Evaluate json.loads(r'''${recent_anomalies.stdout}''') json IF len($anomaly_list) > 0 FOR ${item} IN @{anomaly_list} @@ -279,7 +292,7 @@ Check Deployment Event Anomalies for `${DEPLOYMENT_NAME}` ... 
title= ${item["kind"]} `${item["name"]}` has an average of ${item["average_events_per_minute"]} events per minute (above the threshold of ${ANOMALY_THRESHOLD}) ... reproduce_hint=View Commands Used in Report Output ... details=${item["kind"]} `${item["name"]}` has ${item["count"]} normal events that should be reviewed:\n`${item}` - ... next_steps=${item_next_steps.stdout} + ... next_steps=${item_next_steps.stdout}\n${related_resource_recommendations} END END ${anomalies_report_output}= Set Variable ${recent_anomalies.stdout} @@ -291,6 +304,46 @@ Check Deployment Event Anomalies for `${DEPLOYMENT_NAME}` RW.Core.Add To Report ${anomalies_report_output}\n RW.Core.Add Pre To Report Commands Used:\n${history} +Check ReplicaSet Health for Deployment `${DEPLOYMENT_NAME}` + [Documentation] Fetches all replicasets related to deployment to ensure that conflicting versions don't exist. + [Tags] + ... replica + ... replicaset + ... versions + ... container + ... pods + ... deployment + ... ${DEPLOYMENT_NAME} + ${check_replicaset}= RW.CLI.Run Bash File + ... bash_file=check_replicaset.sh + ... cmd_override=./check_replicaset.sh | tee "${SCRIPT_TMP_DIR}/rs_analysis" + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ${recommendations}= RW.CLI.Run Cli + ... cmd=awk "/Recommended Next Steps:/ {start=1; getline} start" "${SCRIPT_TMP_DIR}/rs_analysis" + ... env=${env} + ... include_in_history=false + IF $recommendations.stdout != "" + ${recommendation_list}= Evaluate json.loads(r'''${recommendations.stdout}''') json + IF len(@{recommendation_list}) > 0 + FOR ${item} IN @{recommendation_list} + RW.Core.Add Issue + ... severity=${item["severity"]} + ... expected=Deployment `${DEPLOYMENT_NAME}` should only have one active replicaset in namespace `${NAMESPACE}` + ... actual=Deployment `${DEPLOYMENT_NAME}` has more than one active replicaset in namespace `${NAMESPACE}` + ... 
title=${item["title"]} + ... reproduce_hint=${check_replicaset.cmd} + ... details=${item["details"]} + ... next_steps=${item["next_steps"]} + END + END + END + RW.Core.Add Pre To Report ${check_replicaset.stdout}\n + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report Commands Used: ${history} *** Keywords *** Suite Initialization @@ -339,7 +392,7 @@ Suite Initialization ... description=Pattern used to exclude entries from log results when searching in log results. ... pattern=\w* ... example=(node_modules|opentelemetry) - ... default=(node_modules|opentelemetry) + ... default=("") ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY ... type=string ... description=Which binary to use for Kubernetes CLI commands. diff --git a/codebundles/k8s-jaeger-http-query/runbook.robot b/codebundles/k8s-jaeger-http-query/runbook.robot index 23970327..95344588 100644 --- a/codebundles/k8s-jaeger-http-query/runbook.robot +++ b/codebundles/k8s-jaeger-http-query/runbook.robot @@ -22,7 +22,7 @@ Query Traces in Jaeger for Unhealthy HTTP Response Codes in Namespace `${NAMESPA ... secret_file__kubeconfig=${kubeconfig} ... timeout_seconds=180 ... include_in_history=false - ... render_in_commandlist=true + ... show_in_rwl_cheatsheet=true ${recommendations}= RW.CLI.Run Cli ... cmd=echo '${http_traces.stdout}' | awk '/Recommended Next Steps:/ {flag=1; next} flag' ... 
env=${env}
diff --git a/libraries/RW/K8sHelper/__init__.py b/libraries/RW/K8sHelper/__init__.py
new file mode 100644
index 00000000..fac3a82e
--- /dev/null
+++ b/libraries/RW/K8sHelper/__init__.py
@@ -0,0 +1 @@
+from .k8s_helper import *
diff --git a/libraries/RW/K8sHelper/k8s_helper.py b/libraries/RW/K8sHelper/k8s_helper.py
new file mode 100644
index 00000000..f7d93706
--- /dev/null
+++ b/libraries/RW/K8sHelper/k8s_helper.py
@@ -0,0 +1,55 @@
+import json
+
+
+def get_related_resource_recommendations(k8s_object):
+    """
+    Build next-step recommendations from the labels of a Kubernetes object.
+
+    Args:
+        k8s_object (str): JSON text of a single Kubernetes object, such as
+            the output of `kubectl get <kind> <name> -o json`.
+
+    Returns:
+        str: One recommendation per recognised GitOps label, separated by
+            newlines; "No recommendations available." when no label matches;
+            or an "Error decoding JSON: ..." message when the input cannot
+            be parsed.
+    """
+    try:
+        obj_json = json.loads(k8s_object)
+    except (TypeError, json.JSONDecodeError) as e:
+        # TypeError covers a non-string input such as None.
+        return f"Error decoding JSON: {e}"
+
+    # `metadata` and `labels` may be absent or explicitly null, and the
+    # document may not be a JSON object at all; all of these mean "no labels".
+    metadata = obj_json.get("metadata") if isinstance(obj_json, dict) else None
+    labels = metadata.get("labels") if isinstance(metadata, dict) else None
+    if not isinstance(labels, dict):
+        labels = {}
+
+    recommendations = []
+
+    # Checking for an ArgoCD label
+    if 'argocd.argoproj.io/instance' in labels:
+        app_name = labels['argocd.argoproj.io/instance'].split('_')[0]
+        recommendations.append(f"Troubleshoot ArgoCD Application `{app_name.capitalize()}`")
+
+    # Check for Flux Resources
+    if 'helm.toolkit.fluxcd.io/name' in labels:
+        fluxcd_helm_name = labels['helm.toolkit.fluxcd.io/name']
+        # The namespace label is looked up defensively so a partially
+        # labelled object cannot raise KeyError.
+        fluxcd_helm_namespace = labels.get('helm.toolkit.fluxcd.io/namespace')
+        recommendation = f"Troubleshoot `{fluxcd_helm_name}` Helm Release Health"
+        if fluxcd_helm_namespace:
+            recommendation += f" in Namespace `{fluxcd_helm_namespace}`"
+        recommendations.append(recommendation)
+
+    # Extend this function to check for other specific labels or annotations as needed
+
+    if not recommendations:
+        return "No recommendations available."
+    # An object can carry both label sets; keep every match instead of
+    # letting the last one overwrite the others.
+    return "\n".join(recommendations)