From ec971e30a7a07d3aee7f4150a1abcab03e01eb3e Mon Sep 17 00:00:00 2001 From: Shea Stewart Date: Mon, 20 Nov 2023 13:11:05 -0500 Subject: [PATCH] pre-merge-resolution --- .../k8s-certmanager-healthcheck/runbook.robot | 96 ---- .../deployment_logs.sh | 318 -------------- .../k8s-deployment-healthcheck/runbook.robot | 234 ---------- .../k8s-namespace-healthcheck/runbook.robot | 409 ------------------ codebundles/k8s-pvc-healthcheck/runbook.robot | 225 ---------- 5 files changed, 1282 deletions(-) delete mode 100644 codebundles/k8s-certmanager-healthcheck/runbook.robot delete mode 100755 codebundles/k8s-deployment-healthcheck/deployment_logs.sh delete mode 100644 codebundles/k8s-deployment-healthcheck/runbook.robot delete mode 100644 codebundles/k8s-namespace-healthcheck/runbook.robot delete mode 100644 codebundles/k8s-pvc-healthcheck/runbook.robot diff --git a/codebundles/k8s-certmanager-healthcheck/runbook.robot b/codebundles/k8s-certmanager-healthcheck/runbook.robot deleted file mode 100644 index 46512d8e..00000000 --- a/codebundles/k8s-certmanager-healthcheck/runbook.robot +++ /dev/null @@ -1,96 +0,0 @@ -*** Settings *** -Documentation This taskset checks that your cert manager certificates are renewing as expected, raising issues when they are past due in the configured namespace -Metadata Author jon-funk -Metadata Display Name Kubernetes CertManager Healthcheck -Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift,CertManager - -Library BuiltIn -Library RW.Core -Library RW.CLI -Library RW.platform -Library OperatingSystem -Library DateTime -Library Collections - -Suite Setup Suite Initialization - - -*** Tasks *** -Get Namespace `${NAMESPACE}` Certificate Summary - [Documentation] Gets a list of certmanager certificates and summarize their information for review. - [Tags] tls certificates kubernetes objects expiration summary certmanager - ${cert_info}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get certificates.cert-manager.io --context=${CONTEXT} -n ${NAMESPACE} -ojson | jq -r --arg now "$(date +%Y-%m-%dT%H:%M:%SZ)" '.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "True")) | select(.status.renewalTime) | select((.status.notAfter | strptime("%Y-%m-%dT%H:%M:%SZ") | mktime) <= ($now | strptime("%Y-%m-%dT%H:%M:%SZ") | mktime)) | "Namespace:" + .metadata.namespace + " URL:" + .spec.dnsNames[0] + " Renews:" + .status.renewalTime + " Expires:" + .status.notAfter' - ... render_in_commandlist=true - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - RW.CLI.Parse Cli Output By Line - ... rsp=${cert_info} - ... set_severity_level=3 - ... set_issue_expected=No certificates found past their set renewal date in the namespace ${NAMESPACE} - ... set_issue_actual=Certificates were found in the namespace ${NAMESPACE} that are past their renewal time and not renewed - ... set_issue_title=Found certificates due for renewal in namespace ${NAMESPACE} that are not renewing - ... set_issue_details=CertManager certificates not renewing: "$_stdout" - investigate CertManager. - ... set_issue_next_steps=Check the health of the `cert-manager` deployment in the `cert-manager` namespace\nInspect `cert-manager` deployment logs for renewal errors in `cert-manager` namespace - ... 
_line__raise_issue_if_contains=Namespace - RW.Core.Add Pre To Report Certificate Information:\n${cert_info.stdout} - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report Commands Used: ${history} - -Find Failed Certificate Requests and Identify Issues In Namespace `${NAMESPACE}` - [Documentation] Gets a list of failed certmanager certificates and summarize their issues. - [Tags] tls certificates kubernetes objects failed certificaterequest certmanager - ${failed_certificaterequests}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get certificaterequests.cert-manager.io --context=${CONTEXT} -n ${NAMESPACE} -o json | jq -r '.items[] | select(.status.conditions[] | select(.type == "Ready" and .status != "True")) | {certRequest: .metadata.name, certificate: (.metadata.ownerReferences[].name), issuer: .spec.issuerRef.name, readyStatus: (.status.conditions[] | select(.type == "Ready")).status, readyMessage: (.status.conditions[] | select(.type == "Ready")).message, approvedStatus: (.status.conditions[] | select(.type == "Approved")).status, approvedMessage: (.status.conditions[] | select(.type == "Approved")).message} | "---\\nCertificateRequest: \\(.certRequest)", "Certificate: \\(.certificate)", "Issuer: \\(.issuer)", "Ready Status: \\(.readyStatus)", "Ready Message: \\(.readyMessage)", "Approved Status: \\(.approvedStatus)", "Approved Message: \\(.approvedMessage)"' - ... render_in_commandlist=true - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - RW.CLI.Parse Cli Output By Line - ... rsp=${failed_certificaterequests} - ... set_severity_level=2 - ... set_issue_expected=All certifiactes to be ready in ${NAMESPACE} - ... set_issue_actual=Certificates are not ready in ${NAMESPACE} - ... set_issue_title=Found failed certificates in namespace ${NAMESPACE} - ... set_issue_details=CertManager certificates failed: "$_stdout" - investigate Issuers or ClusterIssuers. - ... set_issue_next_steps=Check the health of the `cert-manager` deployment in `cert-manager` namespace\nInspect `cert-manager` deployment logs for renewal errors in `cert-manager` namespace - ... _line__raise_issue_if_contains=- - RW.Core.Add Pre To Report Certificate Information:\n${failed_certificaterequests.stdout} - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report Commands Used: ${history} - - -*** Keywords *** -Suite Initialization - ${kubeconfig}= RW.Core.Import Secret - ... kubeconfig - ... type=string - ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). - ... pattern=\w* - ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ - ${kubectl}= RW.Core.Import Service kubectl - ... description=The location service used to interpret shell commands. - ... default=kubectl-service.shared - ... example=kubectl-service.shared - ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY - ... type=string - ... description=Which binary to use for Kubernetes CLI commands. - ... enum=[kubectl,oc] - ... example=kubectl - ... default=kubectl - ${CONTEXT}= RW.Core.Import User Variable CONTEXT - ... type=string - ... description=Which Kubernetes context to operate within. - ... pattern=\w* - ... example=my-main-cluster - ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE - ... type=string - ... description=The name of the namespace to search. - ... pattern=\w* - ... example=otel-demo - ... 
default= - Set Suite Variable ${kubeconfig} ${kubeconfig} - Set Suite Variable ${kubectl} ${kubectl} - Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} - Set Suite Variable ${CONTEXT} ${CONTEXT} - Set Suite Variable ${NAMESPACE} ${NAMESPACE} - Set Suite Variable ${env} {"KUBECONFIG":"./${kubeconfig.key}"} diff --git a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh b/codebundles/k8s-deployment-healthcheck/deployment_logs.sh deleted file mode 100755 index 9c01f0cb..00000000 --- a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh +++ /dev/null @@ -1,318 +0,0 @@ -#!/bin/bash - -# ----------------------------------------------------------------------------- -# Script Information and Metadata -# ----------------------------------------------------------------------------- -# Author: @stewartshea -# Description: This script is designed to fetch and process Kubernetes logs -# and provide helpful insights based on the logs. It uses lnav to sift and -# query the logs for detail, and then tryies to match namespace resources with -# some of the text created by the queries. This can be extended as needed to -# cover many logfile use cases -# ----------------------------------------------------------------------------- - -# Update PATH to ensure script dependencies are found -export PATH="$PATH:$HOME/.lnav:$HOME/.local/bin" - -# -------------------------- Function Definitions ----------------------------- - -# Check if a command exists -function check_command_exists() { - if ! command -v $1 &> /dev/null; then - echo "$1 could not be found" - exit - fi -} - -# Function to filter out common words -filter_common_words() { - local input_string="$1" - local common_words=" to on add could desc not lookup " - local filtered_string="" - - # Loop through each word in the input string - while IFS= read -r word; do - # If the word is not in the common words list, add to filtered string - if [[ ! " $common_words " =~ " $word " ]]; then - filtered_string+="$word"$'\n' - fi - done <<< "$input_string" - - echo "$filtered_string" -} -# ------------------------- Dependency Verification --------------------------- - -# Ensure all the required binaries are accessible -check_command_exists ${KUBERNETES_DISTRIBUTION_BINARY} -check_command_exists jq -check_command_exists lnav - -# Load custom formats for lnav if it's installed -# FIXME: This could be done more efficiently -# Search for the formats directory -lnav_formats_path=$(find / -type d -path '*/extras/lnav/formats' -print -quit 2>/dev/null) -cp -rf $lnav_formats_path/* $HOME/.lnav/formats/installed - - -# Ensure a deployment name was provided -if [ -z "$DEPLOYMENT_NAME" ]; then - echo "You must provide a Kubernetes Deployment name." - exit 1 -fi - -# Fetch label selectors for the provided deployment -SELECTOR=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment $DEPLOYMENT_NAME --namespace=$NAMESPACE -o=jsonpath='{.spec.selector.matchLabels}' | jq -r 'to_entries | .[] | "\(.key)=\(.value)"' | tr '\n' ',' | sed 's/,$//') -if [ -z "$SELECTOR" ]; then - echo "No label selectors found for Deployment $DEPLOYMENT_NAME." 
- exit 1 -fi - -# Iterate through the pods based on the selector and fetch logs -LOG_FILES=() -while read POD; do - CONTAINERS=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $POD --namespace=$NAMESPACE -o=jsonpath='{range .spec.containers[*]}{.name}{"\n"}{end}') - for CONTAINER in $CONTAINERS; do - if [ -n "$LOGS_ERROR_PATTERN" ] && [ -n "$LOGS_EXCLUDE_PATTERN" ]; then - # Both error and exclusion patterns provided - LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --namespace=$NAMESPACE --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Ei "${LOGS_ERROR_PATTERN}" | grep -Eiv "${LOGS_EXCLUDE_PATTERN}") - elif [ -n "$LOGS_ERROR_PATTERN" ]; then - # Only error pattern provided - LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --namespace=$NAMESPACE --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Ei "${LOGS_ERROR_PATTERN}") - elif [ -n "$LOGS_EXCLUDE_PATTERN" ]; then - # Only exclusion pattern provided - LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --namespace=$NAMESPACE --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Eiv "${LOGS_EXCLUDE_PATTERN}") - else - # Neither pattern provided - LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --namespace=$NAMESPACE --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE}) - fi - - # Check log format and store appropriately - FIRST_LINE=$(echo "$LOGS" | head -n 1) - EXT=$(echo "$FIRST_LINE" | jq -e . &>/dev/null && echo "json" || echo "txt") - FILENAME="${POD}_${CONTAINER}_logs.$EXT" - LOG_FILES+=("$FILENAME") - echo "Fetching logs for Pod: $POD, Container: $CONTAINER. Saving to $FILENAME." - echo "$LOGS" > $FILENAME - done -done < <(${KUBERNETES_DISTRIBUTION_BINARY} get pods --selector=$SELECTOR --namespace=$NAMESPACE -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') - -# Initialize an issue description array -issue_descriptions=() - -# ------------------------------- lnav queries -------------------------------- -# The gist here is to provide various types of lnav queries. If a query has -# results, then we can perform some additional tasks that suggest resources -# which might be related -#------------------------------------------------------------------------------- - - -# NOTE: Work needs to be done here to scale this - as we have hard coded in the -# fields and the format - need to figure out how to best match the right formats, -# or can we just use logline - -SEARCH_RESOURCES="" -##### Begin query ##### - -# Format file / table http_logrus_custom -# Search for http log format used by online-boutique (which uses logrus but is custom) -for FILE in "${LOG_FILES[@]}"; do - echo "$FILE" - LOG_SUMMARY=$(lnav -n -c ';SELECT COUNT(*) AS error_count, CASE WHEN "http.req.path" LIKE "/product%" THEN "/product" ELSE "http.req.path" END AS root_path, "http.resp.status" FROM http_logrus_custom WHERE "http.resp.status" = 500 AND NOT "http.req.path" = "/" GROUP BY root_path, "http.resp.status" ORDER BY error_count DESC;' $FILE) - echo "$LOG_SUMMARY" - INTERESTING_PATHS+=$(echo "$LOG_SUMMARY" | awk 'NR>1 && NR<5 {sub(/^\//, "", $2); print $2}')$'\n' -done - -if [[ -n "$INTERESTING_PATHS" ]]; then - SEARCH_RESOURCES+=$(echo "$INTERESTING_PATHS" | awk -F'/' '{for (i=1; i<=NF; i++) print $i}' | sort | uniq) - issue_descriptions+=("HTTP Errors found for paths: $SEARCH_RESOURCES") -else - echo "No interesting HTTP paths found." 
-fi - -# Search for error fields and strings -for FILE in "${LOG_FILES[@]}"; do - echo "$FILE" - ERROR_SUMMARY=$(lnav -n -c ';SELECT error, COUNT(*) AS count FROM http_logrus_custom WHERE error IS NOT NULL GROUP BY error;' $FILE) - echo "$ERROR_SUMMARY" - ERROR_FUZZY_STRING+=$(echo "$ERROR_SUMMARY" | head -n 3 | tr -d '":' | tr ' ' '\n' | awk '{ for (i=1; i<=NF; i++) if (i != 2) print $i }') -done -ERROR_FUZZY_STRING=$(echo "$ERROR_FUZZY_STRING" | sort | uniq) -##### End query ##### - - - - -# # Fetch a list of all resources in the namespace -## Heavyweight - this times out after 30s, but is a better way to get any and all resources -# SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} api-resources --verbs=list --namespaced -o name | xargs -n 1 ${KUBERNETES_DISTRIBUTION_BINARY} get --show-kind --ignore-not-found -n $NAMESPACE) - -## Lightweight - we explicitly specify which resources we want to search -# Run RESOURCE_SEARCH_LIST only if SEARCH_RESOURCES has content -if [[ -n "$SEARCH_RESOURCES" ]]; then - RESOURCE_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment,pods,service,statefulset --context=${CONTEXT} -n ${NAMESPACE}) -else - echo "No search queries returned results." - exit -fi - - - -# # Fuzzy match env vars in deployments with ERROR_FUZZY_STRING -# declare -a FUZZY_ENV_VAR_RESOURCE_MATCHES -# if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then -# # Filter out common words from ERROR_FUZZY_STRING -# FILTERED_ERROR_STRING=$(filter_common_words "$ERROR_FUZZY_STRING") - -# # Convert FILTERED_ERROR_STRING into an array -# mapfile -t PATTERNS <<< "$FILTERED_ERROR_STRING" - -# for resource_type in "deployments" "statefulsets"; do -# for pattern in "${PATTERNS[@]}"; do -# while read -r resource_name; do -# FUZZY_ENV_VAR_RESOURCE_MATCHES+=("$resource_type/$resource_name") -# done < <(${KUBERNETES_DISTRIBUTION_BINARY} get "$resource_type" -n "$NAMESPACE" -o=json | jq --arg pattern "$pattern" -r \ -# ".items[] | -# select( -# .spec.template.spec.containers[]? | -# .env[]? | -# select( -# (.name? // empty | ascii_downcase | contains(\$pattern)) or -# (.value? // empty | ascii_downcase | contains(\$pattern)) -# ) -# ) | -# .metadata.name") -# done -# done -# else -# echo "No search queries or fuzzy matches to perform." -# exit -# fi - -# Fuzzy match env vars in deployments with ERROR_FUZZY_STRING -declare -a FUZZY_ENV_VAR_RESOURCE_MATCHES -if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then - # Filter out common words from ERROR_FUZZY_STRING - FILTERED_ERROR_STRING=$(filter_common_words "$ERROR_FUZZY_STRING") - - # Convert FILTERED_ERROR_STRING into an array - mapfile -t PATTERNS <<< "$FILTERED_ERROR_STRING" - - for resource_type in "deployments" "statefulsets"; do - for pattern in "${PATTERNS[@]}"; do - while IFS="|" read -r resource_name env_key env_value; do - formatted_string="$pattern:$resource_type/$resource_name:$env_key:$env_value" - FUZZY_ENV_VAR_RESOURCE_MATCHES+=("$formatted_string") - done < <(${KUBERNETES_DISTRIBUTION_BINARY} get "$resource_type" -n "$NAMESPACE" -o=json | jq --arg pattern "$pattern" -r \ - ".items[] | - select( - .spec.template.spec.containers[]? | - .env[]? | - select( - (.name? // empty | ascii_downcase | contains(\$pattern)) or - (.value? // empty | ascii_downcase | contains(\$pattern)) - ) - ) | - {resource_name: .metadata.name, matched_env: (.spec.template.spec.containers[] | .env[] | select((.name? // empty | ascii_downcase | contains(\$pattern)) or (.value? 
// empty | ascii_downcase | contains(\$pattern))))} | - [.resource_name, .matched_env.name, .matched_env.value] | join(\"|\")") - - done - done -else - echo "No search queries or fuzzy matches to perform." - exit -fi - -for match in "${FUZZY_ENV_VAR_RESOURCE_MATCHES[@]}"; do - IFS=':' read -ra parts <<< "$match" - string=${parts[0]} - resource=${parts[1]} - env_key=${parts[2]} - env_value=${parts[3]} - echo "Found string \`$string\` in resource \`$resource\`. Check manifest and environment variable \`$env_key\` for accuracy. " -done - -# Fetch namespace events for searching through -EVENT_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get events --context=${CONTEXT} -n ${NAMESPACE}) -event_details="\nThe namespace ${NAMESPACE} has produced the following interesting events:" -event_details+="\n" - -# For each value, search the namespace for applicable resources and events -for RESOURCE in "${SEARCH_RESOURCES[@]}"; do - event_details+=$(echo "$EVENT_SEARCH_LIST" | grep "$RESOURCE" | grep -Eiv "Normal") - INTERESTING_RESOURCES+=$(echo "$RESOURCE_SEARCH_LIST" | grep "$RESOURCE") -done - - -# Try to generate some recommendations from the resource strings we discovered -recommendations=() - -declare -A seen_resources - -if [[ ${#FUZZY_ENV_VAR_RESOURCE_MATCHES[@]} -ne 0 ]]; then - for match in "${FUZZY_ENV_VAR_RESOURCE_MATCHES[@]}"; do - IFS=':' read -ra parts <<< "$match" - string=${parts[0]} - resource=${parts[1]} - env_key=${parts[2]} - env_value=${parts[3]} - - if [[ -z ${seen_resources[$resource]} ]]; then - recommendations+=("Review manifest for \`$resource\` in namespace: \`${NAMESPACE}\`. Matched error log string \`$string\` in environment variable \`$env_key\`. ") - seen_resources[$resource]=1 - fi - done -fi - -if [[ -n "$INTERESTING_RESOURCES" ]]; then - while read -r line; do - # Splitting columns into array - IFS=' ' read -ra cols <<< "$line" - resource="${cols[0]}" - status="${cols[1]}" - restarts="${cols[3]}" - - # Extracting resource type and name - IFS='/' read -ra details <<< "$resource" - type="${details[0]}" - name="${details[1]}" - - case "$type" in - pod) - if [[ "$status" != "Running" ]]; then - recommendations+=("Troubleshoot *failed pods* in *namespace* \`${NAMESPACE}\` ") - fi - if ((restarts > 0)); then - recommendations+=("Troubleshoot *container restarts* in *namespace* \`${NAMESPACE}\` ") - fi - ;; - deployment|deployment.apps) - recommendations+=("Check *deployment* health \`$name\` in *namespace* \`${NAMESPACE}\` ") - ;; - service) - recommendations+=("Check *service* health \`$name\` in *namespace* \`${NAMESPACE}\` ") - ;; - statefulset|statefulset.apps) - recommendations+=("Check *statefulset* health \`$name\` in *namespace* \`${NAMESPACE}\` ") - ;; - esac - done <<< "$INTERESTING_RESOURCES" -else - echo "No resources found based on log query output" -fi - -# Display the issue descriptions -if [[ ${#issue_descriptions[@]} -ne 0 ]]; then - printf "\nIssues Identified: \n" - printf "%s\n" "${issue_descriptions[@]}" | sort -u -fi - -# Display the interesting events for report details -if [[ -n "$event_details" ]]; then - echo -e "$event_details" -fi - -# Display all unique recommendations that can be shown as Next Steps -if [[ ${#recommendations[@]} -ne 0 ]]; then - printf "\nRecommended Next Steps: \n" - printf "%s\n" "${recommendations[@]}" | sort -u -fi \ No newline at end of file diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot deleted file mode 100644 index c4a696d1..00000000 --- 
a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ /dev/null @@ -1,234 +0,0 @@ -*** Settings *** -Documentation Triages issues related to a deployment and its replicas. -Metadata Author jon-funk -Metadata Display Name Kubernetes Deployment Triage -Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift - -Library BuiltIn -Library RW.Core -Library RW.CLI -Library RW.platform -Library OperatingSystem - -Suite Setup Suite Initialization - - -*** Tasks *** -Check Deployment ${DEPLOYMENT_NAME} Log For Issues - [Documentation] Fetches recent logs for the given deployment in the namespace and checks the logs output for issues. - [Tags] fetch log pod container errors inspect trace info deployment - ${logs}= RW.CLI.Run Bash File - ... bash_file=deployment_logs.sh - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... timeout_seconds=180 - ${recommendations}= RW.CLI.Run Cli - ... cmd=echo '''${logs.stdout}''' | awk "/Recommended Next Steps:/ {start=1; getline} start" - ... env=${env} - ... include_in_history=false - ${issues}= RW.CLI.Run Cli - ... cmd=echo '''${logs.stdout}''' | awk '/Issues Identified:/ {start=1} /The namespace online-boutique has produced the following interesting events:/ {start=0} start' - ... env=${env} - ... include_in_history=false - RW.CLI.Parse Cli Output By Line - ... rsp=${logs} - ... set_severity_level=2 - ... set_issue_expected=No logs matching error patterns found in deployment ${DEPLOYMENT_NAME} in namespace: ${NAMESPACE} - ... set_issue_actual=Error logs found in deployment ${DEPLOYMENT_NAME} in namespace: ${NAMESPACE} - ... set_issue_title=Deployment ${DEPLOYMENT_NAME} in ${NAMESPACE} has: \n${issues.stdout} - ... set_issue_details=Deployment ${DEPLOYMENT_NAME} has error logs:\n\n$_stdout - ... set_issue_next_steps=${recommendations.stdout} - ... _line__raise_issue_if_contains=Recommended - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report - ... Recent logs from deployment/${DEPLOYMENT_NAME} in ${NAMESPACE}:\n\n${logs.stdout} - RW.Core.Add Pre To Report Commands Used: ${history} - -Troubleshoot Deployment `${DEPLOYMENT_NAME}` Warning Events - [Documentation] Fetches warning events related to the deployment workload in the namespace and triages any issues found in the events. - [Tags] events workloads errors warnings get deployment - ${events}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --field-selector type=Warning | grep -i "${DEPLOYMENT_NAME}" || true - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - RW.CLI.Parse Cli Output By Line - ... rsp=${events} - ... set_severity_level=1 - ... set_issue_expected=No events of type warning should exist for deployment. - ... set_issue_actual=Events of type warning found for deployment. - ... set_issue_title=The deployment ${DEPLOYMENT_NAME} has warning events - ... set_issue_details=Warning events found for deployment ${DEPLOYMENT_NAME} in namespace ${NAMESPACE}\n$_line\n - ... set_issue_next_steps=Run Application Level Troubleshooting on Deployment `${DEPLOYMENT_NAME}` In Namespace `${NAMESPACE}` and Check Logs For Errors. - ... _line__raise_issue_if_contains=Warning - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report ${events.stdout} - RW.Core.Add Pre To Report Commands Used: ${history} - -Get Deployment `${DEPLOYMENT_NAME}` Details For Report - [Documentation] Fetches the current state of the deployment for future review in the report. 
- [Tags] deployment details manifest info - ${deployment}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o yaml - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report Snapshot of deployment state:\n\n${deployment.stdout} - RW.Core.Add Pre To Report Commands Used: ${history} - -Troubleshoot Deployment `${DEPLOYMENT_NAME}` Replicas - [Documentation] Pulls the replica information for a given deployment and checks if it's highly available - ... , if the replica counts are the expected / healthy values, and if not, what they should be. - [Tags] - ... deployment - ... replicas - ... desired - ... actual - ... available - ... ready - ... unhealthy - ... rollout - ... stuck - ... pods - ${deployment}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${available_replicas}= RW.CLI.Parse Cli Json Output - ... rsp=${deployment} - ... extract_path_to_var__available_replicas=status.availableReplicas || `0` - ... available_replicas__raise_issue_if_lt=1 - ... assign_stdout_from_var=available_replicas - ... set_issue_title=No replicas available for deployment/${DEPLOYMENT_NAME} - ... set_issue_details=No replicas available for deployment/${DEPLOYMENT_NAME} in namespace ${NAMESPACE}, we found 0. - ... set_issue_next_steps=Run Application Level Troubleshooting For `Deployment/${DEPLOYMENT_NAME}` - RW.CLI.Parse Cli Json Output - ... rsp=${available_replicas} - ... extract_path_to_var__available_replicas=@ - ... available_replicas__raise_issue_if_lt=${EXPECTED_AVAILABILITY} - ... set_issue_title=Fewer Than Expected Available Replicas For Deployment ${DEPLOYMENT_NAME} - ... set_issue_details=Fewer than expected replicas available (we found $available_replicas) for deployment ${DEPLOYMENT_NAME} in namespace ${NAMESPACE} - check manifests, kubernetes events, pod logs, resource constraints and PersistentVolumes - ... set_issue_next_steps=Troubleshoot Container Restarts in Namespace `${NAMESPACE}` - ${desired_replicas}= RW.CLI.Parse Cli Json Output - ... rsp=${deployment} - ... extract_path_to_var__desired_replicas=status.replicas || `0` - ... desired_replicas__raise_issue_if_lt=1 - ... assign_stdout_from_var=desired_replicas - ... set_issue_title=Less than desired replicas for deployment/${DEPLOYMENT_NAME} - ... set_issue_details=Less than desired replicas for deployment/${DEPLOYMENT_NAME} in ${NAMESPACE}. - ... set_issue_next_steps=Troubleshoot Deployment `${DEPLOYMENT_NAME}` Warning Events - RW.CLI.Parse Cli Json Output - ... rsp=${desired_replicas} - ... extract_path_to_var__desired_replicas=@ - ... desired_replicas__raise_issue_if_neq=${available_replicas.stdout} - ... set_issue_title=Desired and ready pods for deployment/${DEPLOYMENT_NAME} do not match as expected - ... set_issue_details=Desired and ready pods for deployment/${DEPLOYMENT_NAME} do not match in namespace ${NAMESPACE}, desired: $desired_replicas vs ready: ${available_replicas.stdout}. We got ready:${available_replicas.stdout} vs desired: $desired_replicas - ... 
set_issue_next_steps=Troubleshoot Deployment `${DEPLOYMENT_NAME}` Warning Events - ${desired_replicas}= Convert To Number ${desired_replicas.stdout} - ${available_replicas}= Convert To Number ${available_replicas.stdout} - RW.Core.Add Pre To Report Deployment State:\n${deployment.stdout} - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report Commands Used: ${history} - -Check For Deployment Event Anomalies - [Documentation] Parses all events in a namespace within a timeframe and checks for unusual activity, raising issues for any found. - [Tags] deployment events info state anomolies count occurences - ${recent_anomalies}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --field-selector type!=Warning --context ${CONTEXT} -n ${NAMESPACE} -o json | jq -r '.items[] | select(.involvedObject.name|contains("${DEPLOYMENT_NAME}")) | select( .count / ( if ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 == 0 then 1 else ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 end ) > ${ANOMALY_THRESHOLD}) | "Event(s) Per Minute:" + (.count / ( if ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 == 0 then 1 else ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 end ) |tostring) +" Count:" + (.count|tostring) + " Minute(s):" + (((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60|tostring)+ " Object:" + .involvedObject.namespace + "/" + .involvedObject.kind + "/" + .involvedObject.name + " Reason:" + .reason + " Message:" + .message' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - RW.CLI.Parse Cli Output By Line - ... rsp=${recent_anomalies} - ... set_severity_level=2 - ... set_issue_expected=No unusual recent anomaly events with high counts in the namespace ${NAMESPACE} - ... set_issue_actual=We detected events in the namespace ${NAMESPACE} which are considered anomalies - ... set_issue_title=Event Anomalies Detected In Namespace ${NAMESPACE} - ... set_issue_details=Anomaly non-warning events in namespace ${NAMESPACE}:\n"$_stdout" - ... set_issue_next_steps=Check Deployment `${DEPLOYMENT_NAME}` Log For Issues - ... _line__raise_issue_if_contains=Object - ${history}= RW.CLI.Pop Shell History - ${recent_anomalies}= Set Variable ${recent_anomalies.stdout} - IF """${recent_anomalies}""" == "" - ${recent_anomalies}= Set Variable No anomalies were detected! - END - RW.Core.Add To Report Summary Of Anomalies Detected:\n - RW.Core.Add To Report ${recent_anomalies}\n - RW.Core.Add Pre To Report Commands Used:\n${history} - - -*** Keywords *** -Suite Initialization - ${kubeconfig}= RW.Core.Import Secret - ... kubeconfig - ... type=string - ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). - ... pattern=\w* - ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ - ${kubectl}= RW.Core.Import Service kubectl - ... description=The location service used to interpret shell commands. - ... default=kubectl-service.shared - ... example=kubectl-service.shared - ${DEPLOYMENT_NAME}= RW.Core.Import User Variable DEPLOYMENT_NAME - ... type=string - ... description=Used to target the resource for queries and filtering events. - ... pattern=\w* - ... example=artifactory - ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE - ... type=string - ... description=The name of the Kubernetes namespace to scope actions and searching to. - ... pattern=\w* - ... 
example=my-namespace - ${CONTEXT}= RW.Core.Import User Variable CONTEXT - ... type=string - ... description=Which Kubernetes context to operate within. - ... pattern=\w* - ... example=my-main-cluster - ${EXPECTED_AVAILABILITY}= RW.Core.Import User Variable EXPECTED_AVAILABILITY - ... type=string - ... description=The minimum numbers of replicas allowed considered healthy. - ... pattern=\d+ - ... example=3 - ... default=3 - ${ANOMALY_THRESHOLD}= RW.Core.Import User Variable - ... ANOMALY_THRESHOLD - ... type=string - ... description=The rate of occurence per minute at which an Event becomes classified as an anomaly, even if Kubernetes considers it informational. - ... pattern=\d+(\.\d+)? - ... example=1.0 - ... default=1.0 - ${LOGS_ERROR_PATTERN}= RW.Core.Import User Variable LOGS_ERROR_PATTERN - ... type=string - ... description=The error pattern to use when grep-ing logs. - ... pattern=\w* - ... example=(Error: 13|Error: 14) - ... default=(ERROR) - ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN - ... type=string - ... description=Pattern used to exclude entries from log results when searching in log results. - ... pattern=\w* - ... example=(node_modules|opentelemetry) - ... default=(node_modules|opentelemetry) - ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY - ... type=string - ... description=Which binary to use for Kubernetes CLI commands. - ... enum=[kubectl,oc] - ... example=kubectl - ... default=kubectl - ${HOME}= RW.Core.Import User Variable HOME - Set Suite Variable ${kubeconfig} ${kubeconfig} - Set Suite Variable ${kubectl} ${kubectl} - Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} - Set Suite Variable ${CONTEXT} ${CONTEXT} - Set Suite Variable ${NAMESPACE} ${NAMESPACE} - Set Suite Variable ${DEPLOYMENT_NAME} ${DEPLOYMENT_NAME} - Set Suite Variable ${EXPECTED_AVAILABILITY} ${EXPECTED_AVAILABILITY} - Set Suite Variable ${ANOMALY_THRESHOLD} ${ANOMALY_THRESHOLD} - Set Suite Variable ${LOGS_ERROR_PATTERN} ${LOGS_ERROR_PATTERN} - Set Suite Variable ${LOGS_EXCLUDE_PATTERN} ${LOGS_EXCLUDE_PATTERN} - Set Suite Variable ${HOME} ${HOME} - Set Suite Variable - ... ${env} - ... {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}", "LOGS_ERROR_PATTERN":"${LOGS_ERROR_PATTERN}", "LOGS_EXCLUDE_PATTERN":"${LOGS_EXCLUDE_PATTERN}", "ANOMALY_THRESHOLD":"${ANOMALY_THRESHOLD}", "DEPLOYMENT_NAME": "${DEPLOYMENT_NAME}", "HOME":"${HOME}"} diff --git a/codebundles/k8s-namespace-healthcheck/runbook.robot b/codebundles/k8s-namespace-healthcheck/runbook.robot deleted file mode 100644 index 4352bb96..00000000 --- a/codebundles/k8s-namespace-healthcheck/runbook.robot +++ /dev/null @@ -1,409 +0,0 @@ -*** Settings *** -Documentation This taskset runs general troubleshooting checks against all applicable objects in a namespace, checks error events, and searches pod logs for error entries. -Metadata Author jon-funk -Metadata Display Name Kubernetes Namespace Troubleshoot -Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift - -Library BuiltIn -Library RW.Core -Library RW.CLI -Library RW.platform -Library OperatingSystem -Library DateTime -Library Collections - -Suite Setup Suite Initialization - - -*** Tasks *** -Trace And Troubleshoot Namespace `${NAMESPACE}` Warning Events And Errors - [Documentation] Queries all error events in a given namespace within the last 30 minutes, - ... 
fetches the list of involved pod names, requests logs from them and parses - ... the logs for exceptions. - [Tags] namespace trace error pods events logs grep - # get pods involved with error events - ${error_events}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --field-selector type=Warning --context ${CONTEXT} -n ${NAMESPACE} -o json - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${recent_error_events}= RW.CLI.Parse Cli Json Output - ... rsp=${error_events} - ... extract_path_to_var__recent_events=items - ... recent_events__filter_older_than__60m=lastTimestamp - ... assign_stdout_from_var=recent_events - - ${involved_pod_names}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.items[] | select(.involvedObject.kind == "Pod") | .involvedObject.name' | tr -d "\n"\n${error_events.stdout}EOF - ... include_in_history=False - ${involved_pod_names_array}= Evaluate """${involved_pod_names.stdout}""".split("\\n") - ${event_messages}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.items[] | select(.involvedObject.kind == "Pod") | .message' | tr -d "\n"\n${error_events.stdout}EOF - ... include_in_history=False - ${event_messages_array}= Evaluate """${event_messages.stdout}""".split("\\n") - - ${involved_pod_names}= RW.CLI.Parse Cli Json Output - ... rsp=${error_events} - ... extract_path_to_var__involved_pod_names=items[?involvedObject.kind=='Pod'].involvedObject.name - ... from_var_with_path__involved_pod_names__to__pod_count=length(@) - ... pod_count__raise_issue_if_gt=0 - ... set_issue_title=$pod_count Pods Found With Recent Warning Events In Namespace ${NAMESPACE} - ... set_issue_details=Warning events in the namespace ${NAMESPACE}.\nName of pods with issues:\n"$involved_pod_names"\nTroubleshoot pod or namespace events:\n"${recent_error_events.stdout}" - ... set_issue_next_steps=Run Application Level Troubleshooting and Inspect Pod Logs In Namespace `${NAMESPACE}` For Pods: `${involved_pod_names_array}` - ... assign_stdout_from_var=involved_pod_names - # get pods with restarts > 0 - ${pods_in_namespace}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get pods --context ${CONTEXT} -n ${NAMESPACE} -o json - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ${restart_age}= RW.CLI.String To Datetime 30m - ${pod_names}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.items[].metadata.name'\n${error_events.stdout}EOF - ... include_in_history=False - ${restarting_pods}= RW.CLI.Parse Cli Json Output - ... rsp=${pods_in_namespace} - ... extract_path_to_var__pod_restart_stats=items[].{name:metadata.name, containerRestarts:status.containerStatuses[].{restartCount:restartCount, terminated_at:lastState.terminated.finishedAt}|[?restartCount > `0` && terminated_at >= `${restart_age}`]} - ... from_var_with_path__pod_restart_stats__to__pods_with_recent_restarts=[].{name: name, restartSum:sum(containerRestarts[].restartCount || [`0`])}|[?restartSum > `0`] - ... from_var_with_path__pods_with_recent_restarts__to__restart_pod_names=[].name - ... from_var_with_path__pods_with_recent_restarts__to__pod_count=length(@) - ... pod_count__raise_issue_if_gt=0 - ... set_issue_title=Frequently Restarting Pods In Namespace ${NAMESPACE} - ... set_issue_details=Found $pod_count pods that are frequently restarting in ${NAMESPACE}. Troubleshoot these pods:\n"$pods_with_recent_restarts" - ... set_issue_next_steps=Run Application Level Troubleshooting and Inspect Pod Logs In Namespace `${NAMESPACE}` For Pods: `${pod_names.stdout}` - ... 
assign_stdout_from_var=restart_pod_names - # fetch logs with pod names - ${restarting_pods}= RW.CLI.From Json json_str=${restarting_pods.stdout} - ${involved_pod_names}= RW.CLI.From Json json_str=${involved_pod_names.stdout} - ${podnames_to_query}= Combine Lists ${restarting_pods} ${involved_pod_names} - ${pod_logs_errors}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs --context=${CONTEXT} --namespace=${NAMESPACE} pod/{item} --tail=100 | grep -E -i "${ERROR_PATTERN}" || true - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... loop_with_items=${podnames_to_query} - ${history}= RW.CLI.Pop Shell History - IF """${pod_logs_errors.stdout}""" != "" - ${error_trace_results}= Set Variable - ... Found error logs:\n${pod_logs_errors.stdout}\n\nEffected Pods: ${podnames_to_query}\n - ELSE - ${error_trace_results}= Set Variable No trace errors found! - END - RW.Core.Add Pre To Report Summary of error trace in namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${error_trace_results} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Troubleshoot Container Restarts In Namespace `${NAMESPACE}` - [Documentation] Fetches pods that have container restarts and provides a report of the restart issues. - [Tags] namespace containers status restarts - ${container_restart_details}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get pods --context=${CONTEXT} -n ${NAMESPACE} -o json | jq -r --argjson exit_code_explanations '{"0": "Success", "1": "Error", "2": "Misconfiguration", "130": "Pod terminated by SIGINT", "134": "Abnormal Termination SIGABRT", "137": "Pod terminated by SIGKILL - Possible OOM", "143":"Graceful Termination SIGTERM"}' '.items[] | select(.status.containerStatuses != null) | select(any(.status.containerStatuses[]; .restartCount > 0)) | "---\\npod_name: \\(.metadata.name)\\n" + (.status.containerStatuses[] | "containers: \\(.name)\\nrestart_count: \\(.restartCount)\\nmessage: \\(.state.waiting.message // "N/A")\\nterminated_reason: \\(.lastState.terminated.reason // "N/A")\\nterminated_finishedAt: \\(.lastState.terminated.finishedAt // "N/A")\\nterminated_exitCode: \\(.lastState.terminated.exitCode // "N/A")\\nexit_code_explanation: \\($exit_code_explanations[.lastState.terminated.exitCode | tostring] // "Unknown exit code")") + "\\n---\\n"' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${container_restart_analysis}= RW.CLI.Run Bash File - ... bash_file=container_restarts.sh - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ${recommendations}= RW.CLI.Run Cli - ... cmd=echo "${container_restart_analysis.stdout}" | awk '/Recommended Next Steps:/ {flag=1; next} flag' - ... env=${env} - ... include_in_history=false - RW.CLI.Parse Cli Output By Line - ... rsp=${container_restart_analysis} - ... set_severity_level=2 - ... set_issue_expected=Containers should not be restarting. - ... set_issue_actual=We found the following containers with restarts: $_stdout - ... set_issue_title=Container Restarts Detected In Namespace ${NAMESPACE} - ... set_issue_details=${container_restart_analysis.stdout} - ... set_issue_next_steps=${recommendations.stdout} - ... 
_line__raise_issue_if_contains=Recommend - ${history}= RW.CLI.Pop Shell History - IF """${container_restart_details.stdout}""" == "" - ${container_restart_details}= Set Variable No container restarts found - ELSE - ${container_restart_details}= Set Variable ${container_restart_details.stdout} - END - RW.Core.Add Pre To Report Summary of unready container restarts in namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${container_restart_analysis.stdout} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Troubleshoot Pending Pods In Namespace `${NAMESPACE}` - [Documentation] Fetches pods that are pending and provides details. - [Tags] namespace pods status pending - ${pending_pods}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get pods --context=${CONTEXT} -n ${NAMESPACE} --field-selector=status.phase=Pending --no-headers -o json | jq -r '.items[] | "---\\npod_name: \\(.metadata.name)\\nstatus: \\(.status.phase // "N/A")\\nmessage: \\(.status.conditions[].message // "N/A")\\nreason: \\(.status.conditions[].reason // "N/A")\\ncontainerStatus: \\((.status.containerStatuses // [{}])[].state // "N/A")\\ncontainerMessage: \\((.status.containerStatuses // [{}])[].state?.waiting?.message // "N/A")\\ncontainerReason: \\((.status.containerStatuses // [{}])[].state?.waiting?.reason // "N/A")\\n---\\n"' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - RW.CLI.Parse Cli Output By Line - ... rsp=${pending_pods} - ... set_severity_level=1 - ... set_issue_expected=Pods should not be stuck pending. - ... set_issue_actual=We found the following pods in a pending state: $_stdout - ... set_issue_title=Pending Pods Found In Namespace ${NAMESPACE} - ... set_issue_details=Pods pending with reasons:\n"$_stdout" in the namespace ${NAMESPACE} - ... set_issue_next_steps=Run Application Level Troubleshooting and Inspect Pod Logs In Namespace `${NAMESPACE}` - ... _line__raise_issue_if_contains=- - ${history}= RW.CLI.Pop Shell History - IF """${pending_pods.stdout}""" == "" - ${pending_pods}= Set Variable No pending pods found - ELSE - ${pending_pods}= Set Variable ${pending_pods.stdout} - END - RW.Core.Add Pre To Report Summary of pendind pods in namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${pending_pods} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Troubleshoot Failed Pods In Namespace `${NAMESPACE}` - [Documentation] Fetches all pods which are not running (unready) in the namespace and adds them to a report for future review. - [Tags] namespace pods status unready not starting phase failed - ${unreadypods_details}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get pods --context=${CONTEXT} -n ${NAMESPACE} --field-selector=status.phase=Failed --no-headers -o json | jq -r --argjson exit_code_explanations '{"0": "Success", "1": "Error", "2": "Misconfiguration", "130": "Pod terminated by SIGINT", "134": "Abnormal Termination SIGABRT", "137": "Pod terminated by SIGKILL - Possible OOM", "143":"Graceful Termination SIGTERM"}' '.items[] | "---\\npod_name: \\(.metadata.name)\\nrestart_count: \\(.status.containerStatuses[0].restartCount // "N/A")\\nmessage: \\(.status.message // "N/A")\\nterminated_finishedAt: \\(.status.containerStatuses[0].state.terminated.finishedAt // "N/A")\\nexit_code: \\(.status.containerStatuses[0].state.terminated.exitCode // "N/A")\\nexit_code_explanation: \\($exit_code_explanations[.status.containerStatuses[0].state.terminated.exitCode | tostring] // "Unknown exit code")\\n---\\n"' - ... 
env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - RW.CLI.Parse Cli Output By Line - ... rsp=${unreadypods_details} - ... set_severity_level=1 - ... set_issue_expected=No pods should be in an unready state - ... set_issue_actual=We found the following unready pods: $_stdout - ... set_issue_title=Unready Pods Detected In Namespace ${NAMESPACE} - ... set_issue_details=Unready pods:\n"$_stdout" in the namespace ${NAMESPACE} - ... set_issue_next_steps=Run Application Level Troubleshooting and Inspect Pod Logs In Namespace `${NAMESPACE}` - ... _line__raise_issue_if_contains=- - ${history}= RW.CLI.Pop Shell History - IF """${unreadypods_details.stdout}""" == "" - ${unreadypods_details}= Set Variable No unready pods found - ELSE - ${unreadypods_details}= Set Variable ${unreadypods_details.stdout} - END - RW.Core.Add Pre To Report Summary of unready pods in namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${unreadypods_details} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Troubleshoot Workload Status Conditions In Namespace `${NAMESPACE}` - [Documentation] Parses all workloads in a namespace and inspects their status conditions for issues. Status conditions with a status value of False are considered an error. - [Tags] namespace status conditions pods reasons workloads - ${all_resources}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get all --context ${CONTEXT} -n ${NAMESPACE} -o json - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${workload_info}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '[.items[] | {kind: .kind, name: .metadata.name, conditions: .status.conditions[]? | select(.status == "False")}][0] // null'\n${all_resources.stdout}EOF - ... include_in_history=False - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ${condition}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.conditions.reason' | tr -d "\n"\n${workload_info.stdout}EOF - ... include_in_history=False - ${workload_name}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.name' | tr -d "\n"\n${workload_info.stdout}EOF - ... include_in_history=False - ${workload_kind}= RW.CLI.Run Cli - ... cmd=cat << 'EOF' | jq -r '.kind' | tr -d "\n"\n${workload_info.stdout}EOF - ... include_in_history=False - ${failing_conditions}= RW.CLI.Parse Cli Json Output - ... rsp=${all_resources} - ... extract_path_to_var__workload_conditions=items[].{kind:kind, name:metadata.name, conditions:status.conditions[?status == `False`]} - ... from_var_with_path__workload_conditions__to__failing_workload_conditions=[?length(conditions || `[]`) > `0`] - ... from_var_with_path__failing_workload_conditions__to__aggregate_failures=[].{kind:kind,name:name,conditions:conditions[].{reason:reason, type:type, status:status}} - ... from_var_with_path__aggregate_failures__to__pods_with_failures=length(@) - ... pods_with_failures__raise_issue_if_gt=0 - ... set_severity_level=1 - ... set_issue_title=$pods_with_failures Pods With Unhealthy Status In Namespace ${NAMESPACE} - ... set_issue_details=Pods with unhealthy status condition in the namespace ${NAMESPACE}. Here's a summary of potential issues we found:\n"$aggregate_failures" - ... set_issue_next_steps=Run Application Level Troubleshooting On Workload `${workload_name.stdout}` to further diagnose condition: `${condition.stdout}` - ... 
assign_stdout_from_var=aggregate_failures - ${history}= RW.CLI.Pop Shell History - IF """${failing_conditions.stdout}""" == "" - ${failing_conditions}= Set Variable No unready pods found - ELSE - ${failing_conditions}= Set Variable ${failing_conditions.stdout} - END - RW.Core.Add Pre To Report Summary of Pods with Failing Conditions in Namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${failing_conditions} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Get Listing Of Resources In Namespace `${NAMESPACE}` - [Documentation] Simple fetch all to provide a snapshot of information about the workloads in the namespace for future review in a report. - [Tags] get all resources info workloads namespace manifests - ${all_results}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} api-resources --verbs=list --namespaced -o name --context=${CONTEXT} | xargs -n 1 ${KUBERNETES_DISTRIBUTION_BINARY} get --show-kind --ignore-not-found -n ${NAMESPACE} --context=${CONTEXT} - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... render_in_commandlist=true - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report Informational Get All for Namespace: ${NAMESPACE} - RW.Core.Add Pre To Report ${all_results.stdout} - RW.Core.Add Pre To Report Commands Used:\n${history} - -Check For Namespace `${NAMESPACE}` Event Anomalies - [Documentation] Parses all events in a namespace within a timeframe and checks for unusual activity, raising issues for any found. - [Tags] namespace events info state anomolies count occurences - ${recent_anomalies}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --field-selector type!=Warning --context ${CONTEXT} -n ${NAMESPACE} -o json | jq -r '.items[] | select( .count / ( if ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 == 0 then 1 else ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 end ) > ${ANOMALY_THRESHOLD}) | "Event(s) Per Minute:" + (.count / ( if ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 == 0 then 1 else ((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60 end ) |tostring) +" Count:" + (.count|tostring) + " Minute(s):" + (((.lastTimestamp|fromdate)-(.firstTimestamp|fromdate))/60|tostring)+ " Object:" + .involvedObject.namespace + "/" + .involvedObject.kind + "/" + .involvedObject.name + " Reason:" + .reason + " Message:" + .message' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ${event_messages}= RW.CLI.Run CLI - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --field-selector type!=Warning --context ${CONTEXT} -n ${NAMESPACE} -o json | jq -r .items[].message - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... include_in_history=False - ${event_messages}= Evaluate """${event_messages.stdout}""".split("\\n") - ${pod_name}= RW.CLI.Run Cli - ... cmd=echo "${recent_anomalies.stdout}" | grep -oP '(?<=Pod/)[^ ]*' | grep -oP '[^.]*(?=-[a-z0-9]+-[a-z0-9]+)' | head -n 1 - ... include_in_history=False - RW.CLI.Parse Cli Output By Line - ... rsp=${recent_anomalies} - ... expected_rsp_returncodes=[0,5] - ... set_severity_level=2 - ... set_issue_expected=No unusual recent anomaly events with high counts in the namespace ${NAMESPACE} - ... set_issue_actual=We detected events in the namespace ${NAMESPACE} which are considered anomalies - ... set_issue_title=Event Anomalies Detected In Namespace ${NAMESPACE} - ... set_issue_details=Anomaly non-warning events in namespace ${NAMESPACE}:\n"$_stdout" - ... 
set_issue_next_steps=Run Application Level Troubleshooting On Pod `${pod_name.stdout}` In Namespace `${NAMESPACE}` and Inspect Logs. - ... _line__raise_issue_if_contains=Object - ${history}= RW.CLI.Pop Shell History - RW.Core.Add To Report Summary Of Anomalies Detected:\n - RW.Core.Add To Report ${recent_anomalies.stdout}\n - RW.Core.Add Pre To Report Commands Used:\n${history} - -Troubleshoot Namespace `${NAMESPACE}` Services And Application Workloads - [Documentation] Iterates through the services within a namespace for a given timeframe and byte length max, checking the resulting logs for distinct entries matching a given pattern in order to determine a root issue. - [Tags] - ... namespace - ... services - ... applications - ... workloads - ... deployments - ... apps - ... ingress - ... http - ... networking - ... endpoints - ... logs - ... aggregate - ... filter - ${aggregate_service_logs}= RW.CLI.Run Cli - ... cmd=services=($(${KUBERNETES_DISTRIBUTION_BINARY} get svc -o=name --context=${CONTEXT} -n ${NAMESPACE})); logs=""; for service in "\${services[@]}"; do logs+=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $service --limit-bytes=256000 --since=2h --context=${CONTEXT} -n ${NAMESPACE} | grep -Ei "${SERVICE_ERROR_PATTERN}" | grep -Ev "${SERVICE_EXCLUDE_PATTERN}" | sort | uniq -c | awk '{print "Issue Occurences:",$0}'); done; echo "\${logs}" - ... env=${env} - ... secret_file__kubeconfig=${KUBECONFIG} - ... render_in_commandlist=true - RW.CLI.Parse Cli Output By Line - ... rsp=${aggregate_service_logs} - ... set_severity_level=3 - ... set_issue_expected=Service workload logs in namespace ${NAMESPACE} should not contain any error entries - ... set_issue_actual=Service workload logs in namespace ${NAMESPACE} contain errors entries - ... set_issue_title=Service Workloads In Namespace ${NAMESPACE} Have Error Log Entries - ... set_issue_details=We found the following distinctly counted errors in the service workloads of namespace ${NAMESPACE}:\n\n$_stdout\n\nThese errors may be related to other workloads that need triaging - ... set_issue_next_steps=Check For Deployment Event Anomalies - ... _line__raise_issue_if_contains=Error - ${history}= RW.CLI.Pop Shell History - RW.Core.Add To Report Sample Of Aggregate Counted Logs Found:\n - RW.Core.Add To Report ${aggregate_service_logs.stdout}\n - RW.Core.Add Pre To Report Commands Used:\n${history} - -Check Missing or Risky PodDisruptionBudget Policies In Namespace `${NAMESPACE}` - [Documentation] Searches through deployemnts and statefulsets to determine if they are missing PodDistruptionBudgets or have them configured in a risky way that prohibits cluster or node upgrades. - [Tags] poddisruptionbudget availability unavailable risky missing policy - ${pdb_check}= RW.CLI.Run Cli - ... cmd=context="${CONTEXT}"; namespace="${NAMESPACE}"; check_health() { local type=$1; local name=$2; local replicas=$3; local selector=$4; local pdbs=$(${KUBERNETES_DISTRIBUTION_BINARY} --context "$context" --namespace "$namespace" get pdb -o json | jq -c --arg selector "$selector" '.items[] | select(.spec.selector.matchLabels | to_entries[] | .key + "=" + .value == $selector)'); if [[ $replicas -gt 1 && -z "$pdbs" ]]; then printf "%-30s %-30s %-10s\\n" "$type/$name" "" "Missing"; else echo "$pdbs" | jq -c . 
| while IFS= read -r pdb; do local pdbName=$(echo "$pdb" | jq -r '.metadata.name'); local minAvailable=$(echo "$pdb" | jq -r '.spec.minAvailable // ""'); local maxUnavailable=$(echo "$pdb" | jq -r '.spec.maxUnavailable // ""'); if [[ "$minAvailable" == "100%" || "$maxUnavailable" == "0" || "$maxUnavailable" == "0%" ]]; then printf "%-30s %-30s %-10s\\n" "$type/$name" "$pdbName" "Risky"; elif [[ $replicas -gt 1 && ("$minAvailable" != "100%" || "$maxUnavailable" != "0" || "$maxUnavailable" != "0%") ]]; then printf "%-30s %-30s %-10s\\n" "$type/$name" "$pdbName" "OK"; fi; done; fi; }; echo "Deployments:"; echo "-----------"; printf "%-30s %-30s %-10s\\n" "NAME" "PDB" "STATUS"; ${KUBERNETES_DISTRIBUTION_BINARY} --context "$context" --namespace "$namespace" get deployments -o json | jq -c '.items[] | "\\(.metadata.name) \\(.spec.replicas) \\(.spec.selector.matchLabels | to_entries[] | .key + "=" + .value)"' | while read -r line; do check_health "Deployment" $(echo $line | tr -d '"'); done; echo ""; echo "Statefulsets:"; echo "-------------"; printf "%-30s %-30s %-10s\\n" "NAME" "PDB" "STATUS"; ${KUBERNETES_DISTRIBUTION_BINARY} --context "$context" --namespace "$namespace" get statefulsets -o json | jq -c '.items[] | "\\(.metadata.name) \\(.spec.replicas) \\(.spec.selector.matchLabels | to_entries[] | .key + "=" + .value)"' | while read -r line; do check_health "StatefulSet" $(echo $line | tr -d '"'); done - ... env=${env} - ... secret_file__kubeconfig=${KUBECONFIG} - ... render_in_commandlist=true - ${risky_pdbs}= RW.CLI.Run Cli - ... cmd=echo "${pdb_check.stdout}" | grep 'Risky' | cut -f 1 -d ' ' | awk -F'/' '{print $1 ":" $2}' - ... include_in_history=False - ${missing_pdbs}= RW.CLI.Run Cli - ... cmd=echo "${pdb_check.stdout}" | grep 'Missing' | cut -f 1 -d ' ' | awk -F'/' '{print $1 ":" $2}' - ... include_in_history=False - RW.CLI.Parse Cli Output By Line - ... rsp=${pdb_check} - ... set_severity_level=2 - ... set_issue_expected=PodDisruptionBudgets in `${NAMESPACE}` should not block regular maintenance - ... set_issue_actual=We detected PodDisruptionBudgets in namespace `${NAMESPACE}` which are considered Risky to maintenance operations - ... set_issue_title=Risky PodDisruptionBudgets Found in namespace `${NAMESPACE}` - ... set_issue_details=Review the PodDisruptionBudget check for `${NAMESPACE}`:$_stdout - ... set_issue_next_steps=Review & Edit PodDisruptionBudget for `${risky_pdbs.stdout}` - ... _line__raise_issue_if_contains=(.*?) - RW.CLI.Parse Cli Output By Line - ... rsp=${pdb_check} - ... set_severity_level=4 - ... set_issue_expected=PodDisruptionBudgets in `${NAMESPACE}` should exist for applications that have more than 1 replica - ... set_issue_actual=We detected Deployments or StatefulSets in namespace `${NAMESPACE}` which are missing PodDisruptionBudgets - ... set_issue_title=Deployments or StatefulSets in namespace `${NAMESPACE}` are missing PodDisruptionBudgets - ... set_issue_details=Review the Deployments and StatefulSets missing PodDisruptionBudget in `${NAMESPACE}`:\n$_stdout - ... set_issue_next_steps=Create missing Pod Distruption Budgets for `${missing_pdbs.stdout}` - ... _line__raise_issue_if_contains=Missing - ${history}= RW.CLI.Pop Shell History - RW.Core.Add To Report ${pdb_check.stdout}\n - RW.Core.Add Pre To Report Commands Used:\n${history} - - -*** Keywords *** -Suite Initialization - ${kubeconfig}= RW.Core.Import Secret - ... kubeconfig - ... type=string - ... 
-
-
-*** Keywords ***
-Suite Initialization
-    ${kubeconfig}=    RW.Core.Import Secret
-    ...    kubeconfig
-    ...    type=string
-    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
-    ...    pattern=\w*
-    ...    example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
-    ${kubectl}=    RW.Core.Import Service    kubectl
-    ...    description=The location service used to interpret shell commands.
-    ...    default=kubectl-service.shared
-    ...    example=kubectl-service.shared
-    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
-    ...    type=string
-    ...    description=The name of the Kubernetes namespace to scope actions and searching to.
-    ...    pattern=\w*
-    ...    example=my-namespace
-    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
-    ...    type=string
-    ...    description=Which Kubernetes context to operate within.
-    ...    pattern=\w*
-    ...    example=my-main-cluster
-    ${ERROR_PATTERN}=    RW.Core.Import User Variable    ERROR_PATTERN
-    ...    type=string
-    ...    description=The error pattern to use when grep-ing logs.
-    ...    pattern=\w*
-    ...    example=(Error|Exception)
-    ...    default=(Error|Exception)
-    ${SERVICE_ERROR_PATTERN}=    RW.Core.Import User Variable    SERVICE_ERROR_PATTERN
-    ...    type=string
-    ...    description=The error pattern to use when grep-ing logs for services.
-    ...    pattern=\w*
-    ...    example=(Error: 13|Error: 14)
-    ...    default=(Error:)
-    ${SERVICE_EXCLUDE_PATTERN}=    RW.Core.Import User Variable    SERVICE_EXCLUDE_PATTERN
-    ...    type=string
-    ...    description=Pattern used to exclude entries from log results when searching in service logs.
-    ...    pattern=\w*
-    ...    example=(node_modules|opentelemetry)
-    ...    default=(node_modules|opentelemetry)
-    ${ANOMALY_THRESHOLD}=    RW.Core.Import User Variable
-    ...    ANOMALY_THRESHOLD
-    ...    type=string
-    ...    description=The rate of occurrence per minute at which an Event becomes classified as an anomaly, even if Kubernetes considers it informational.
-    ...    pattern=\d+(\.\d+)?
-    ...    example=1.0
-    ...    default=1.0
-    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
-    ...    type=string
-    ...    description=Which binary to use for Kubernetes CLI commands.
-    ...    enum=[kubectl,oc]
-    ...    example=kubectl
-    ...    default=kubectl
-    ${HOME}=    RW.Core.Import User Variable    HOME
-    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
-    Set Suite Variable    ${kubectl}    ${kubectl}
-    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
-    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
-    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
-    Set Suite Variable    ${ERROR_PATTERN}    ${ERROR_PATTERN}
-    Set Suite Variable    ${ANOMALY_THRESHOLD}    ${ANOMALY_THRESHOLD}
-    Set Suite Variable    ${SERVICE_ERROR_PATTERN}    ${SERVICE_ERROR_PATTERN}
-    Set Suite Variable    ${SERVICE_EXCLUDE_PATTERN}    ${SERVICE_EXCLUDE_PATTERN}
-    Set Suite Variable    ${HOME}    ${HOME}
-    Set Suite Variable
-    ...    ${env}
-    ...    {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}", "HOME":"${HOME}"}
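For anyone replaying these commands outside the Robot suite, the variables imported above map onto ordinary environment variables. A hypothetical local setup might look like the sketch below; every value is a placeholder and nothing here is defined by the codebundle itself.

    #!/usr/bin/env bash
    # Placeholder values mirroring the suite's imported variables.
    export KUBECONFIG="$HOME/.kube/config"
    export CONTEXT="my-main-cluster"
    export NAMESPACE="my-namespace"
    export KUBERNETES_DISTRIBUTION_BINARY="kubectl"   # or "oc" on OpenShift

    # Quick sanity check that the context and namespace are reachable.
    "$KUBERNETES_DISTRIBUTION_BINARY" --context "$CONTEXT" -n "$NAMESPACE" get pods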
diff --git a/codebundles/k8s-pvc-healthcheck/runbook.robot b/codebundles/k8s-pvc-healthcheck/runbook.robot
deleted file mode 100644
index a0729a65..00000000
--- a/codebundles/k8s-pvc-healthcheck/runbook.robot
+++ /dev/null
@@ -1,225 +0,0 @@
-*** Settings ***
-Documentation    This taskset collects information about storage such as PersistentVolumes and PersistentVolumeClaims to
-...    validate health or help troubleshoot potential storage issues.
-Metadata    Author    stewartshea
-Metadata    Display Name    Kubernetes Persistent Volume Healthcheck
-Metadata    Supports    Kubernetes,AKS,EKS,GKE,OpenShift
-
-Library    BuiltIn
-Library    RW.Core
-Library    RW.CLI
-Library    RW.platform
-Library    OperatingSystem
-Library    DateTime
-Library    Collections
-
-Suite Setup    Suite Initialization
-
-
-*** Tasks ***
-Fetch Events for Unhealthy Kubernetes PersistentVolumeClaims In Namespace `${NAMESPACE}`
-    [Documentation]    Lists events related to PersistentVolumeClaims within the namespace that are not bound to PersistentVolumes.
-    [Tags]
-    ...    pvc
-    ...    list
-    ...    kubernetes
-    ...    storage
-    ...    persistentvolumeclaim
-    ...    persistentvolumeclaims events
-    ...    check event output and related nodes, persistentvolumes, persistentvolumeclaims, image registry authentication, or fluxcd or argocd logs.
-    ${unbound_pvc_events}=    RW.CLI.Run Cli
-    ...    cmd=for pvc in $(${KUBERNETES_DISTRIBUTION_BINARY} get pvc -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r '.items[] | select(.status.phase != "Bound") | .metadata.name'); do ${KUBERNETES_DISTRIBUTION_BINARY} get events -n ${NAMESPACE} --context ${CONTEXT} --field-selector involvedObject.name=$pvc -o json | jq '.items[]| "Last Timestamp: " + .lastTimestamp + " Name: " + .involvedObject.name + " Message: " + .message'; done
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    ${regexp}=    Catenate
-    ...    (?m)(?P<line>.+)
-    RW.CLI.Parse Cli Output By Line
-    ...    rsp=${unbound_pvc_events}
-    ...    lines_like_regexp=${regexp}
-    ...    set_severity_level=1
-    ...    set_issue_expected=PVCs should be bound
-    ...    set_issue_actual=PVCs found pending with the following events
-    ...    set_issue_title=PVC Errors & Events In Namespace ${NAMESPACE}
-    ...    set_issue_details=We found "$line" in the namespace ${NAMESPACE}
-    ...    set_issue_next_steps=Review list of unbound `PersistentVolumeClaims` in namespace `${NAMESPACE}`\nCheck `Node` `Events`, `StorageClasses` and `CSI drivers`\nReview your application configurations
-    ...    line__raise_issue_if_contains=Name
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report    Summary of events for unbound pvc in ${NAMESPACE}:
-    RW.Core.Add Pre To Report    ${unbound_pvc_events.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
-
-List PersistentVolumeClaims in Terminating State In Namespace `${NAMESPACE}`
-    [Documentation]    Lists persistentvolumeclaims in a Terminating state.
-    [Tags]    pvc    list    kubernetes    storage    persistentvolumeclaim    terminating    check    persistentvolumes
-    ${terminating_pvcs}=    RW.CLI.Run Cli
-    ...    cmd=namespace=${NAMESPACE}; context=${CONTEXT}; ${KUBERNETES_DISTRIBUTION_BINARY} get pvc -n $namespace --context=$context -o json | jq -r '.items[] | select(.metadata.deletionTimestamp != null) | .metadata.name as $name | .metadata.deletionTimestamp as $deletion_time | .metadata.finalizers as $finalizers | "\\($name) is in Terminating state (Deletion started at: \\($deletion_time)). Finalizers: \\($finalizers)"'
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report    Summary of PersistentVolumeClaims in Terminating state in ${NAMESPACE}:
-    RW.Core.Add Pre To Report    ${terminating_pvcs.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
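The Terminating-state check above boils down to a single jq filter on metadata.deletionTimestamp. A simplified, standalone sketch follows (kubectl and jq assumed; context and namespace values are placeholders):

    #!/usr/bin/env bash
    # List PVCs whose deletion has started (metadata.deletionTimestamp is set),
    # along with the finalizers that may be holding them open.
    CONTEXT="my-main-cluster"    # placeholder
    NAMESPACE="my-namespace"     # placeholder

    kubectl get pvc -n "$NAMESPACE" --context "$CONTEXT" -o json \
      | jq -r '.items[]
          | select(.metadata.deletionTimestamp != null)
          | "\(.metadata.name) terminating since \(.metadata.deletionTimestamp), finalizers: \(.metadata.finalizers)"'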
-
-List PersistentVolumes in Terminating State In Namespace `${NAMESPACE}`
-    [Documentation]    Lists events related to persistent volumes in Terminating state.
-    [Tags]
-    ...    pv
-    ...    list
-    ...    kubernetes
-    ...    storage
-    ...    persistentvolume
-    ...    terminating
-    ...    events
-    ...    check event output and related nodes, persistentvolumes, persistentvolumeclaims, image registry authentication, or fluxcd or argocd logs.
-    ${dangling_pvcs}=    RW.CLI.Run Cli
-    ...    cmd=for pv in $(${KUBERNETES_DISTRIBUTION_BINARY} get pv --context ${CONTEXT} -o json | jq -r '.items[] | select(.status.phase == "Terminating") | .metadata.name'); do ${KUBERNETES_DISTRIBUTION_BINARY} get events --all-namespaces --field-selector involvedObject.name=$pv --context ${CONTEXT} -o json | jq '.items[]| "Last Timestamp: " + .lastTimestamp + " Name: " + .involvedObject.name + " Message: " + .message'; done
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    ${regexp}=    Catenate
-    ...    (?m)(?P<_line>.+)
-    RW.CLI.Parse Cli Output By Line
-    ...    rsp=${dangling_pvcs}
-    ...    lines_like_regexp=${regexp}
-    ...    set_severity_level=4
-    ...    set_issue_expected=PV should not be stuck terminating.
-    ...    set_issue_actual=PV is in a terminating state.
-    ...    set_issue_title=PV Events While Terminating In Namespace ${NAMESPACE}
-    ...    set_issue_details=We found "$_line" in the namespace ${NAMESPACE}
-    ...    set_issue_next_steps=Review `PersistentVolumeClaims` in ${NAMESPACE} after waiting a couple of minutes to see if they resolve\nCheck Health of `Deployments` and `StatefulSets` mounting the volumes in `${NAMESPACE}`\nEnsure no `Pods` attached to the `PersistentVolumeClaims` are status=`Running` in namespace `${NAMESPACE}` as this can prevent them from terminating
-    ...    _line__raise_issue_if_contains=Name
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report    Summary of events for dangling persistent volumes:
-    RW.Core.Add Pre To Report    ${dangling_pvcs.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
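A standalone sketch of the loop above, which collects cluster events for every PersistentVolume reporting a Terminating phase (kubectl and jq assumed; the context value is a placeholder):

    #!/usr/bin/env bash
    # Fetch events for each PV stuck in a Terminating phase.
    CONTEXT="my-main-cluster"   # placeholder

    for pv in $(kubectl get pv --context "$CONTEXT" -o json \
                  | jq -r '.items[] | select(.status.phase == "Terminating") | .metadata.name'); do
      kubectl get events --all-namespaces --context "$CONTEXT" \
          --field-selector "involvedObject.name=$pv" -o json \
        | jq -r '.items[] | "Last Timestamp: \(.lastTimestamp) Name: \(.involvedObject.name) Message: \(.message)"'
    done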
-
-List Pods with Attached Volumes and Related PersistentVolume Details In Namespace `${NAMESPACE}`
-    [Documentation]    For each pod in a namespace, collect details on configured PersistentVolumeClaim, PersistentVolume, and node.
-    [Tags]
-    ...    pod
-    ...    storage
-    ...    pvc
-    ...    pv
-    ...    status
-    ...    csi
-    ...    storagereport
-    ...    check event output and related nodes, persistentvolumes, persistentvolumeclaims, image registry authentication, or fluxcd or argocd logs.
-    ${pod_storage_report}=    RW.CLI.Run Cli
-    ...    cmd=for pod in $(${KUBERNETES_DISTRIBUTION_BINARY} get pods -n ${NAMESPACE} --field-selector=status.phase=Running --context ${CONTEXT} -o jsonpath='{range .items[*]}{.metadata.name}{"\\n"}{end}'); do for pvc in $(${KUBERNETES_DISTRIBUTION_BINARY} get pods $pod -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{range .spec.volumes[*]}{.persistentVolumeClaim.claimName}{"\\n"}{end}'); do pv=$(${KUBERNETES_DISTRIBUTION_BINARY} get pvc $pvc -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{.spec.volumeName}') && status=$(${KUBERNETES_DISTRIBUTION_BINARY} get pv $pv --context ${CONTEXT} -o jsonpath='{.status.phase}') && node=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $pod -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{.spec.nodeName}') && zone=$(${KUBERNETES_DISTRIBUTION_BINARY} get nodes $node --context ${CONTEXT} -o jsonpath='{.metadata.labels.topology\\.kubernetes\\.io/zone}') && storageclass=$(${KUBERNETES_DISTRIBUTION_BINARY} get pvc $pvc -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{.spec.storageClassName}') && accessmode=$(${KUBERNETES_DISTRIBUTION_BINARY} get pvc $pvc -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{.status.accessModes[0]}') && reclaimpolicy=$(${KUBERNETES_DISTRIBUTION_BINARY} get pv $pv --context ${CONTEXT} -o jsonpath='{.spec.persistentVolumeReclaimPolicy}') && csidriver=$(${KUBERNETES_DISTRIBUTION_BINARY} get pv $pv --context ${CONTEXT} -o jsonpath='{.spec.csi.driver}') && echo -e "\\n---\\nPod: $pod\\nPVC: $pvc\\nPV: $pv\\nStatus: $status\\nNode: $node\\nZone: $zone\\nStorageClass: $storageclass\\nAccessModes: $accessmode\\nReclaimPolicy: $reclaimpolicy\\nCSIDriver: $csidriver\\n"; done; done
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report    Summary of configured persistent volumes in ${NAMESPACE}:
-    RW.Core.Add Pre To Report    ${pod_storage_report.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
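The report command chains several kubectl lookups per pod. For a single pod, the Pod -> PVC -> PV -> Node walk reduces to roughly the following sketch (kubectl assumed; the pod, namespace, and context names are placeholders):

    #!/usr/bin/env bash
    # Walk the Pod -> PVC -> PV -> Node chain for one pod.
    POD="my-pod"                 # placeholder
    NAMESPACE="my-namespace"     # placeholder
    CONTEXT="my-main-cluster"    # placeholder

    node=$(kubectl get pod "$POD" -n "$NAMESPACE" --context "$CONTEXT" -o jsonpath='{.spec.nodeName}')
    for pvc in $(kubectl get pod "$POD" -n "$NAMESPACE" --context "$CONTEXT" \
                   -o jsonpath='{range .spec.volumes[*]}{.persistentVolumeClaim.claimName}{"\n"}{end}'); do
      pv=$(kubectl get pvc "$pvc" -n "$NAMESPACE" --context "$CONTEXT" -o jsonpath='{.spec.volumeName}')
      sc=$(kubectl get pvc "$pvc" -n "$NAMESPACE" --context "$CONTEXT" -o jsonpath='{.spec.storageClassName}')
      phase=$(kubectl get pv "$pv" --context "$CONTEXT" -o jsonpath='{.status.phase}')
      echo "Pod: $POD  PVC: $pvc  PV: $pv  StorageClass: $sc  Phase: $phase  Node: $node"
    done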
-
-Fetch the Storage Utilization for PVC Mounts In Namespace `${NAMESPACE}`
-    [Documentation]    For each pod in a namespace, fetch the utilization of any PersistentVolumeClaims mounted using the Linux df command. Requires kubectl exec permissions.
-    [Tags]
-    ...    pod
-    ...    storage
-    ...    pvc
-    ...    utilization
-    ...    capacity
-    ...    persistentvolumeclaims
-    ...    persistentvolumeclaim
-    ...    check pvc
-    ...    check event output and related nodes, persistentvolumes, persistentvolumeclaims, image registry authentication, or fluxcd or argocd logs.
-    ${pod_pvc_utilization}=    RW.CLI.Run Cli
-    ...    cmd=for pod in $(${KUBERNETES_DISTRIBUTION_BINARY} get pods -n ${NAMESPACE} --field-selector=status.phase=Running --context ${CONTEXT} -o jsonpath='{range .items[*]}{.metadata.name}{"\\n"}{end}'); do for pvc in $(${KUBERNETES_DISTRIBUTION_BINARY} get pods $pod -n ${NAMESPACE} --context ${CONTEXT} -o jsonpath='{range .spec.volumes[*]}{.persistentVolumeClaim.claimName}{"\\n"}{end}'); do for volumeName in $(${KUBERNETES_DISTRIBUTION_BINARY} get pod $pod -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r '.spec.volumes[] | select(has("persistentVolumeClaim")) | .name'); do mountPath=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $pod -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r --arg vol "$volumeName" '.spec.containers[].volumeMounts[] | select(.name == $vol) | .mountPath'); containerName=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $pod -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r --arg vol "$volumeName" '.spec.containers[] | select(.volumeMounts[].name == $vol) | .name'); echo -e "\\n---\\nPod: $pod, PVC: $pvc, volumeName: $volumeName, containerName: $containerName, mountPath: $mountPath"; ${KUBERNETES_DISTRIBUTION_BINARY} exec $pod -n ${NAMESPACE} --context ${CONTEXT} -c $containerName -- df -h $mountPath; done; done; done;
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    ${unhealthy_volume_capacity}=    RW.CLI.Run Cli
-    ...    cmd=echo "${pod_pvc_utilization.stdout}" | awk '/---/ { if (flag) { print record "\\n" $0; } record = ""; flag = 0; next; } $5 ~ /[9][5-9]%/ || $5 == "100%" { flag = 1; } { if (record == "") { record = $0; } else { record = record "\\n" $0; } } END { if (flag) { print record; } }'
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ${unhealthy_volume_list}=    RW.CLI.Run Cli
-    ...    cmd=echo "${unhealthy_volume_capacity.stdout}" | awk -F'[,:]' '/Pod:/ {print "Pod:" $2, "PVC:" $4}'
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    RW.CLI.Parse Cli Output By Line
-    ...    rsp=${unhealthy_volume_capacity}
-    ...    set_severity_level=2
-    ...    set_issue_expected=PVC should be less than 95% utilized.
-    ...    set_issue_actual=PVC utilization is 95% or greater.
-    ...    set_issue_title=PVC Storage Utilization As Reported by Pod
-    ...    set_issue_details=Found excessive PVC Utilization for: \n${unhealthy_volume_capacity.stdout}
-    ...    _line__raise_issue_if_contains=Pod
-    ...    set_issue_next_steps=Clean up or expand `PersistentVolumeClaims` in namespace `${NAMESPACE}` for: \n ${unhealthy_volume_list.stdout}
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report    Summary of PVC storage mount utilization in ${NAMESPACE}:
-    RW.Core.Add Pre To Report    ${pod_pvc_utilization.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
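The utilization check relies on kubectl exec and df, with the awk filter flagging mounts at 95% or above. A simplified single-container sketch of the same threshold logic is shown below (placeholder names; kubectl exec permissions assumed; it also assumes df prints the usage column on its second output line):

    #!/usr/bin/env bash
    # Report utilization of one PVC mount and flag it at 95% or above.
    POD="my-pod"; CONTAINER="my-container"; MOUNT_PATH="/data"    # placeholders
    NAMESPACE="my-namespace"; CONTEXT="my-main-cluster"           # placeholders

    usage=$(kubectl exec "$POD" -n "$NAMESPACE" --context "$CONTEXT" -c "$CONTAINER" -- \
              df -h "$MOUNT_PATH" | awk 'NR==2 {gsub("%", "", $5); print $5}')

    if [ -z "$usage" ]; then
      echo "Could not read utilization for $POD:$MOUNT_PATH"
    elif [ "$usage" -ge 95 ]; then
      echo "WARNING: $POD:$MOUNT_PATH is ${usage}% full - clean up or expand the PVC"
    else
      echo "OK: $POD:$MOUNT_PATH is ${usage}% full"
    fi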
-
-Check for RWO Persistent Volume Node Attachment Issues In Namespace `${NAMESPACE}`
-    [Documentation]    For each pod in a namespace, check if it has an RWO persistent volume claim and if so, validate that the pod and the pv are on the same node.
-    [Tags]
-    ...    pod
-    ...    storage
-    ...    pvc
-    ...    readwriteonce
-    ...    node
-    ...    persistentvolumeclaims
-    ...    persistentvolumeclaim
-    ...    scheduled
-    ...    attachment
-    ${pod_rwo_node_and_pod_attachment}=    RW.CLI.Run Cli
-    ...    cmd=NAMESPACE="${NAMESPACE}"; CONTEXT="${CONTEXT}"; PODS=$(kubectl get pods -n $NAMESPACE --context=$CONTEXT -o json); for pod in $(jq -r '.items[] | @base64' <<< "$PODS"); do _jq() { jq -r \${1} <<< "$(base64 --decode <<< \${pod})"; }; POD_NAME=$(_jq '.metadata.name'); POD_NODE_NAME=$(kubectl get pod $POD_NAME -n $NAMESPACE --context=$CONTEXT -o custom-columns=:.spec.nodeName --no-headers); PVC_NAMES=$(kubectl get pod $POD_NAME -n $NAMESPACE --context=$CONTEXT -o jsonpath='{.spec.volumes[*].persistentVolumeClaim.claimName}'); for pvc_name in $PVC_NAMES; do PVC=$(kubectl get pvc $pvc_name -n $NAMESPACE --context=$CONTEXT -o json); ACCESS_MODE=$(jq -r '.spec.accessModes[0]' <<< "$PVC"); if [[ "$ACCESS_MODE" == "ReadWriteOnce" ]]; then PV_NAME=$(jq -r '.spec.volumeName' <<< "$PVC"); STORAGE_NODE_NAME=$(jq -r --arg pv "$PV_NAME" '.items[] | select(.status.volumesAttached != null) | select(.status.volumesInUse[] | contains($pv)) | .metadata.name' <<< "$(kubectl get nodes --context=$CONTEXT -o json)"); echo "-----"; if [[ "$POD_NODE_NAME" == "$STORAGE_NODE_NAME" ]]; then echo "OK: Pod and Storage Node Matched"; else echo "Error: Pod and Storage Node Mismatched - If the issue persists, the node requires attention."; fi; echo "Pod: $POD_NAME"; echo "PVC: $pvc_name"; echo "PV: $PV_NAME"; echo "Node with Pod: $POD_NODE_NAME"; echo "Node with Storage: $STORAGE_NODE_NAME"; echo; fi; done; done
-    ...    env=${env}
-    ...    secret_file__kubeconfig=${kubeconfig}
-    ...    render_in_commandlist=true
-    RW.CLI.Parse Cli Output By Line
-    ...    rsp=${pod_rwo_node_and_pod_attachment}
-    ...    set_severity_level=2
-    ...    set_issue_expected=All pods with RWO storage must be scheduled on the same node on which the persistent volume is attached: ${NAMESPACE}
-    ...    set_issue_actual=Pods with RWO storage were found on a different node than their storage: ${NAMESPACE}
-    ...    set_issue_title=Pods with RWO storage may have storage scheduling issues for namespace: ${NAMESPACE}
-    ...    set_issue_details=All Pods and their RWO storage details are:\n\n$_stdout\n\n
-    ...    set_issue_next_steps=List `Pods` in namespace `${NAMESPACE}` and review the `Nodes` they're scheduled on\nReview Kubernetes `Scheduler` logs\nCheck `Node Affinity` and `Taints/Tolerations`
-    ...    _line__raise_issue_if_contains=Error
-    ${history}=    RW.CLI.Pop Shell History
-    RW.Core.Add Pre To Report
-    ...    Summary of Pods with RWO storage and their node scheduling details for namespace: ${NAMESPACE}:
-    RW.Core.Add Pre To Report    ${pod_rwo_node_and_pod_attachment.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${history}
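The RWO check compares the node that schedules the pod against the node whose status.volumesInUse lists the backing PersistentVolume. A minimal single-pod sketch of that comparison (kubectl and jq assumed; the pod, PVC, namespace, and context names are placeholders):

    #!/usr/bin/env bash
    # Compare the node running a pod with the node reporting its RWO volume as in use.
    POD="my-pod"; PVC="my-pvc"                              # placeholders
    NAMESPACE="my-namespace"; CONTEXT="my-main-cluster"     # placeholders

    pod_node=$(kubectl get pod "$POD" -n "$NAMESPACE" --context "$CONTEXT" -o jsonpath='{.spec.nodeName}')
    pv=$(kubectl get pvc "$PVC" -n "$NAMESPACE" --context "$CONTEXT" -o jsonpath='{.spec.volumeName}')
    storage_node=$(kubectl get nodes --context "$CONTEXT" -o json \
      | jq -r --arg pv "$pv" '.items[]
          | select(.status.volumesInUse != null)
          | select(.status.volumesInUse[] | contains($pv))
          | .metadata.name')

    if [ "$pod_node" = "$storage_node" ]; then
      echo "OK: pod and RWO volume are both on $pod_node"
    else
      echo "Mismatch: pod on $pod_node, volume in use on $storage_node"
    fi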
-
-
-*** Keywords ***
-Suite Initialization
-    ${kubeconfig}=    RW.Core.Import Secret
-    ...    kubeconfig
-    ...    type=string
-    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
-    ...    pattern=\w*
-    ...    example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
-    ${kubectl}=    RW.Core.Import Service    kubectl
-    ...    description=The location service used to interpret shell commands.
-    ...    default=kubectl-service.shared
-    ...    example=kubectl-service.shared
-    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
-    ...    type=string
-    ...    description=Which binary to use for Kubernetes CLI commands.
-    ...    enum=[kubectl,oc]
-    ...    example=kubectl
-    ...    default=kubectl
-    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
-    ...    type=string
-    ...    description=Which Kubernetes context to operate within.
-    ...    pattern=\w*
-    ...    example=my-main-cluster
-    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
-    ...    type=string
-    ...    description=The name of the namespace to search.
-    ...    pattern=\w*
-    ...    example=otel-demo
-    ...    default=
-    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
-    Set Suite Variable    ${kubectl}    ${kubectl}
-    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
-    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
-    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
-    Set Suite Variable    ${env}    {"KUBECONFIG":"./${kubeconfig.key}"}