Updates/ns (#316)

* test script update to fix next step var sub * revert prev change and fix echo * update next steps for unready kustomizations * add reproduce_hint * fix typo, another next step * deployment logs update * update next steps for deployments
runwhen-contrib · Jan 31, 2024 · fcfc064 · fcfc064
1 parent 8c86b9b
commit fcfc064
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 62 deletions.
diff --git a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh b/codebundles/k8s-deployment-healthcheck/deployment_logs.sh
@@ -114,11 +114,14 @@ SEARCH_RESOURCES=""
 
 # Format file / table http_logrus_custom
 # Search for http log format used by online-boutique (which uses logrus but is custom)
+echo "Query for HTTP Path patterns"
 for FILE in "${LOG_FILES[@]}"; do
     echo "$FILE"
     LOG_SUMMARY=$(lnav -n -c ';SELECT COUNT(*) AS error_count, CASE WHEN "http.req.path" LIKE "/product%" THEN "/product" ELSE "http.req.path" END AS root_path, "http.resp.status" FROM http_logrus_custom WHERE "http.resp.status" = 500 AND NOT "http.req.path" = "/" GROUP BY root_path, "http.resp.status" ORDER BY error_count DESC;' $FILE)
     echo "$LOG_SUMMARY"
-    INTERESTING_PATHS+=$(echo "$LOG_SUMMARY" | awk 'NR>1 && NR<5 {sub(/^\//, "", $2); print $2}')$'\n'
+    if [[ $LOG_SUMMARY ]]; then 
+        INTERESTING_PATHS+=$(echo "$LOG_SUMMARY" | awk 'NR>1 && NR<5 {sub(/^\//, "", $2); print $2}')$'\n'
+    fi
 done
 
 if [[ -n "$INTERESTING_PATHS" ]]; then
@@ -129,7 +132,17 @@ else
     echo "No interesting HTTP paths found."
 fi
 
+## Lightweight - we explicitly specify which resources we want to search
+# Run RESOURCE_SEARCH_LIST only if SEARCH_RESOURCES has content
+if [[ -n "$SEARCH_RESOURCES" ]]; then
+    RESOURCE_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment,pods,service,statefulset --context=${CONTEXT} -n ${NAMESPACE})
+else
+    echo "No search queries based on HTTP Paths returned results."
+fi
+
+
 # Search for error fields and strings
+echo "Query for generic error logs and sort"
 for FILE in "${LOG_FILES[@]}"; do
     echo "$FILE"
     ERROR_SUMMARY=$(lnav -n -c ';SELECT error, COUNT(*) AS count FROM http_logrus_custom WHERE error IS NOT NULL GROUP BY error;' $FILE)
@@ -139,30 +152,11 @@ done
 ERROR_FUZZY_STRING=$(echo "$ERROR_FUZZY_STRING" | sort | uniq)
 ##### End query #####
 
-
-
-
-# # Fetch a list of all resources in the namespace
-## Heavyweight - this times out after 30s, but is a better way to get any and all resources
-# SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} api-resources --verbs=list --namespaced -o name  | xargs -n 1 ${KUBERNETES_DISTRIBUTION_BINARY} get --show-kind --ignore-not-found -n $NAMESPACE)
-
-## Lightweight - we explicitly specify which resources we want to search
-# Run RESOURCE_SEARCH_LIST only if SEARCH_RESOURCES has content
-if [[ -n "$SEARCH_RESOURCES" ]]; then
-    RESOURCE_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment,pods,service,statefulset --context=${CONTEXT} -n ${NAMESPACE})
-else
-    echo "No search queries returned results."
-    exit
-fi
-
-
-
 # Fuzzy match env vars in deployments with ERROR_FUZZY_STRING
 declare -a FUZZY_ENV_VAR_RESOURCE_MATCHES
-if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then
+if [[ -n "$ERROR_FUZZY_STRING" ]]; then
     # Filter out common words from ERROR_FUZZY_STRING
     FILTERED_ERROR_STRING=$(filter_common_words "$ERROR_FUZZY_STRING")
-
     # Convert FILTERED_ERROR_STRING into an array
     mapfile -t PATTERNS <<< "$FILTERED_ERROR_STRING"
 
@@ -188,7 +182,6 @@ if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then
     done
 else
     echo "No search queries or fuzzy matches to perform."
-    exit
 fi
 
 for match in "${FUZZY_ENV_VAR_RESOURCE_MATCHES[@]}"; do
@@ -202,7 +195,7 @@ done
 
 # Fetch namespace events for searching through
 EVENT_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY}  get events --context=${CONTEXT} -n ${NAMESPACE})
-event_details="\nThe namespace `${NAMESPACE}` has produced the following interesting events:"
+event_details="\nThe namespace \`${NAMESPACE}\` has produced the following interesting events:"
 event_details+="\n"
 
 # For each value, search the namespace for applicable resources and events
@@ -226,6 +219,7 @@ if [[ ${#FUZZY_ENV_VAR_RESOURCE_MATCHES[@]} -ne 0 ]]; then
         env_value=${parts[3]}
 
         if [[ -z ${seen_resources[$resource]} ]]; then
+            issue_descriptions+=("Error log could be related to \`$resource\`")
             recommendations+=("Review manifest for \`$resource\` in namespace: \`${NAMESPACE}\`. Matched error log string \`$string\` in environment variable \`$env_key\`.  ")
             seen_resources[$resource]=1
         fi
@@ -255,18 +249,16 @@ if [[ -n "$INTERESTING_RESOURCES" ]]; then
             fi
             ;;
         deployment|deployment.apps)
-            recommendations+=("Check deployment health \`$name\` in namespace \`${NAMESPACE}\`")
+            recommendations+=("Check Deployment health \`$name\` in namespace \`${NAMESPACE}\`")
             ;;
         service)
-            recommendations+=("Check service health \`$name\` in namespace \`${NAMESPACE}\`")
+            recommendations+=("Check Service health \`$name\` in namespace \`${NAMESPACE}\`")
             ;;
         statefulset|statefulset.apps)
-            recommendations+=("Check statefulset health \`$name\` in namespace \`${NAMESPACE}\`")
+            recommendations+=("Check Statefulset health \`$name\` in namespace \`${NAMESPACE}\`")
             ;;
         esac
     done <<< "$INTERESTING_RESOURCES"
-else
-    echo "No resources found based on log query output"
 fi 
 
 # Display the issue descriptions

diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot
@@ -43,16 +43,17 @@ Check Deployment Log For Issues with `${DEPLOYMENT_NAME}`
     ...    cmd=awk '/Issues Identified:/ {start=1; next} /The namespace `${NAMESPACE}` has produced the following interesting events:/ {start=0} start' <<< '''${logs.stdout}'''
     ...    env=${env}
     ...    include_in_history=false
-    # FIXME: Refactor this to a loop of 1 issue per line of issue output - better alinging next steps with specific issues
-    RW.Core.Add Issue
-    ...    severity=2
-    ...    expected=No logs matching error patterns found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
-    ...    actual=Error logs found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
-    ...    title=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` has error logs.
-    ...    reproduce_hint=View Commands Used in Report Output
-    ...    details=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` generated the following log analysis: \n${logs.stdout}
-    ...    next_steps=${recommendations.stdout}
-
+    ## We should improve deployment_logs.sh to generate a match issue + next steps + severity level
+    IF    len($issues.stdout) > 0
+        RW.Core.Add Issue
+        ...    severity=3
+        ...    expected=No logs matching error patterns found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
+        ...    actual=Error logs found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
+        ...    title=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` is generating error logs.
+        ...    reproduce_hint=View Commands Used in Report Output
+        ...    details=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` generated the following log analysis: \n${logs.stdout}
+        ...    next_steps=${recommendations.stdout}
+    END
     ${history}=    RW.CLI.Pop Shell History
     RW.Core.Add Pre To Report
     ...    Recent logs from deployment/`${DEPLOYMENT_NAME}` in `${NAMESPACE}`:\n\n${logs.stdout}
@@ -80,7 +81,7 @@ Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
     ...    secret_file__kubeconfig=${kubeconfig}
     ...    show_in_rwl_cheatsheet=true
     ${recommendations}=    RW.CLI.Run Cli
-    ...    cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< "${liveness_probe_health.stdout}"
+    ...    cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< '''${liveness_probe_health.stdout}'''
     ...    env=${env}
     ...    include_in_history=false
     IF    len($recommendations.stdout) > 0
@@ -117,7 +118,7 @@ Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
     ...    secret_file__kubeconfig=${kubeconfig}
     ...    show_in_rwl_cheatsheet=true
     ${recommendations}=    RW.CLI.Run Cli
-    ...    cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< "${readiness_probe_health.stdout}"
+    ...    cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< '''${readiness_probe_health.stdout}'''
     ...    env=${env}
     ...    include_in_history=false
     IF    len($recommendations.stdout) > 0

diff --git a/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh b/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh
@@ -26,16 +26,16 @@ if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
     next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report")
 fi
 
-if [[ $messages =~ "Deployment does not have minimum availability" && $owner_kind == "Deployment" ]]; then
-    next_steps+=("Troubleshoot Deployment Warning Events for \`$owner_name\`")
-    next_steps+=("Troubleshoot Container Restarts In Namespace \`$NAMESPACE\`")
-fi
-
 if [[ $messages =~ "Misconfiguration" ]]; then
-    next_steps+=("Review configuration of  owner_kind \`$owner_name\`")
+    next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
     next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
 fi
 
+if [[ $messages =~ "PodInitializing" ]]; then
+    next_steps+=("Check $owner_kind Health for \`$owner_name\`")
+    next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
+fi
+
 if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
     next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
@@ -44,11 +44,21 @@ if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe fa
     next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
 
+if [[ $messages =~ "PodFailed" ]]; then
+    next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
+fi
+
 if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
     next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
 fi
 
+if [[ $messages =~ "Back-off restarting failed container" ]]; then
+    next_steps+=("Check Log for $owner_kind \`$owner_name\`")
+    next_steps+=("Troubleshoot Warning Events for $owner_kind \`$owner_name\`")
+
+fi
+
 if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
     next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
@@ -58,5 +68,25 @@ if [[ $messages =~ "forbidden: failed quota" ]]; then
     next_steps+=("Check Resource Quota Utilization in Namepace `${NAMESPACE}`")
 fi
 
+if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
+    next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner. ")
+    next_steps+=("Increase Node Count in Cluster")
+    next_steps+=("Check for Quota Errors")
+fi
+
+if [[ $messages =~ "max node group size reached" ]]; then
+    next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
+    next_steps+=("Increase node count in cluster.")
+    next_steps+=("Check for quota errors.")
+fi
+
+if [[ $messages =~ "Health check failed after" ]]; then
+    next_steps+=("Check $owner_kind \`$owner_name\` Health")
+fi
+
+if [[ ${#next_steps[@]} -eq 0 ]]; then
+    next_steps+=("Please review the report logs and escalate the issue if necessary.")
+fi
+
 # Display the list of recommendations
 printf "%s\n" "${next_steps[@]}" | sort | uniq
diff --git a/codebundles/k8s-fluxcd-kustomization-health/runbook.robot b/codebundles/k8s-fluxcd-kustomization-health/runbook.robot
@@ -6,6 +6,8 @@ Metadata            Supports    Kubernetes,AKS,EKS,GKE,OpenShift,FluxCD
 Library             RW.Core
 Library             RW.CLI
 Library             RW.platform
+Library             RW.NextSteps
+Library             String
 
 Suite Setup         Suite Initialization
 
@@ -28,26 +30,35 @@ Get details for unready Kustomizations in Namespace `${NAMESPACE}`
     [Documentation]    List all Kustomizations that are not found in a ready state in namespace ${NAMESPACE}  
     [Tags]        FluxCD     Kustomization    Versions    ${NAMESPACE}
     ${kustomizations_not_ready}=    RW.CLI.Run Cli
-    ...    cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${RESOURCE_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r '.items[] | select (.status.conditions[] | select(.type == "Ready" and .status == "False")) | "---\\nKustomization Name: \\(.metadata.name)\\n\\nReady Status: \\(.status.conditions[] | select(.type == "Ready") | "\\n ready: \\(.status)\\n message: \\(.message)\\n reason: \\(.reason)\\n last_transition_time: \\(.lastTransitionTime)")\\n\\nReconcile Status:\\(.status.conditions[] | select(.type == "Reconciling") |"\\n reconciling: \\(.status)\\n message: \\(.message)")\\n---\\n"'
+    ...    cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${RESOURCE_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json | jq '[.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | {KustomizationName: .metadata.name, ReadyStatus: {ready: (.status.conditions[] | select(.type == "Ready").status), message: (.status.conditions[] | select(.type == "Ready").message), reason: (.status.conditions[] | select(.type == "Ready").reason), last_transition_time: (.status.conditions[] | select(.type == "Ready").lastTransitionTime)}, ReconcileStatus: {reconciling: (.status.conditions[] | select(.type == "Reconciling").status), message: (.status.conditions[] | select(.type == "Reconciling").message)}}]'
     ...    env=${env}
     ...    secret_file__kubeconfig=${KUBECONFIG}
     ...    show_in_rwl_cheatsheet=true
-    ...    render_in_commandlist=true
-    RW.CLI.Parse Cli Output By Line
-    ...    rsp=${kustomizations_not_ready}
-    ...    set_severity_level=2
-    ...    set_issue_expected=Kustomizations should be synced and ready.   
-    ...    set_issue_actual=We found the following kustomization objects in a pending state: $_stdout
-    ...    set_issue_title=Unready Kustomizations Found In Namespace ${NAMESPACE}
-    ...    set_issue_details=Kustomizations pending with reasons:\n"$_stdout" in the namespace ${NAMESPACE}
-    ...    _line__raise_issue_if_contains=-
+    ${kustomizations_not_ready_list}=    Evaluate    json.loads(r'''${kustomizations_not_ready.stdout}''')    json
+    IF    len(@{kustomizations_not_ready_list}) > 0
+        FOR    ${item}    IN    @{kustomizations_not_ready_list}               
+            ${messages}=    Replace String    ${item["ReadyStatus"]["message"]}   "    ${EMPTY}
+            ${item_next_steps}=    RW.CLI.Run Bash File
+            ...    bash_file=workload_next_steps.sh
+            ...    cmd_override=./workload_next_steps.sh "${messages}"
+            ...    env=${env}
+            ...    include_in_history=False
+            RW.Core.Add Issue
+            ...    severity=2
+            ...    expected=Kustomizations should be synced and ready.   
+            ...    actual=We found the following kustomization objects in a pending state: ${item}
+            ...    title=Unready Kustomization \`${item["KustomizationName"]}\` Found In Namespace \`${NAMESPACE}\`
+            ...    reproduce_hint=${kustomizations_not_ready.cmd}
+            ...    details=${item}
+            ...    next_steps=${item_next_steps.stdout}
+        END
+    END
     ${history}=    RW.CLI.Pop Shell History
     IF    """${kustomizations_not_ready.stdout}""" == ""
         ${kustomizations_not_ready}=    Set Variable    No Kustomizations Pending Found
     ELSE
         ${kustomizations_not_ready}=    Set Variable    ${kustomizations_not_ready.stdout}
     END
-    ${history}=    RW.CLI.Pop Shell History
     RW.Core.Add Pre To Report    Kustomizations with: \n ${kustomizations_not_ready}
     RW.Core.Add Pre To Report    Commands Used:\n${history}
 

diff --git a/codebundles/k8s-fluxcd-kustomization-health/workload_next_steps.sh b/codebundles/k8s-fluxcd-kustomization-health/workload_next_steps.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# -----------------------------------------------------------------------------
+# Script Information and Metadata
+# -----------------------------------------------------------------------------
+# Author: @stewartshea
+# Description: This script takes in event message strings captured from a
+# Kubernetes-based system and provides some generalized next steps based on the
+# content of the message.
+# -----------------------------------------------------------------------------
+# Input: $1 - event message text; owner kind and name are parsed from a [kind/name/details] token inside it
+messages="$1"
+
+
+# Try to parse out object details
+# Splitting the extracted string to get individual parts
+matched=$(echo "$messages" | grep -oP "\[\K(\w+\/\w+\/.+?)(?=\])")
+owner_kind=$(echo "$matched" | cut -d'/' -f1)
+owner_name=$(echo "$matched" | cut -d'/' -f2)
+additional_details=$(echo "$matched" | cut -d'/' -f3-)
+
+# Initialize an empty array to store recommendations
+next_steps=()
+
+
+if [[ $messages =~ "Health check failed" ]]; then
+    next_steps+=("Troubleshoot $owner_kind Replicas for \`$owner_name\`")
+    next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
+fi
+
+
+
+# Display the list of recommendations
+printf "%s\n" "${next_steps[@]}" | sort | uniq
diff --git a/codebundles/k8s-namespace-healthcheck/runbook.robot b/codebundles/k8s-namespace-healthcheck/runbook.robot
@@ -127,9 +127,9 @@ Troubleshoot Pending Pods In Namespace `${NAMESPACE}`
     ...    secret_file__kubeconfig=${kubeconfig}
     ...    show_in_rwl_cheatsheet=true
     ...    render_in_commandlist=true
-    ${pendind_pod_list}=    Split String    ${pending_pods.stdout}    _______-
-    IF    len($pendind_pod_list) > 0
-        FOR    ${item}    IN    @{pendind_pod_list}
+    ${pending_pod_list}=    Split String    ${pending_pods.stdout}    _______-
+    IF    len($pending_pod_list) > 0
+        FOR    ${item}    IN    @{pending_pod_list}
             ${is_not_just_newline}=    Evaluate    '''${item}'''.strip() != ''
             IF    ${is_not_just_newline}
                 ${pod_name}=    RW.CLI.Run Cli

diff --git a/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh b/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh
@@ -27,10 +27,15 @@ if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
 fi
 
 if [[ $messages =~ "Misconfiguration" ]]; then
-    next_steps+=("Review configuration of  owner_kind \`$owner_name\`")
+    next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
     next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
 fi
 
+if [[ $messages =~ "PodInitializing" ]]; then
+    next_steps+=("Check $owner_kind Health for \`$owner_name\`")
+    next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
+fi
+
 if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
     next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
@@ -54,7 +59,6 @@ if [[ $messages =~ "Back-off restarting failed container" ]]; then
 
 fi
 
-
 if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
     next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")