K8s ns/more next steps (#413)

* change older workload_next_steps structure
* update all the scripts to better handle multi-word pattern matching
* workload next steps touchup. fix pod name in pending pod task
runwhen-contrib · Jul 25, 2024 · 3838427 · 3838427
1 parent c34ec19
commit 3838427
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 94 deletions.
diff --git a/codebundles/k8s-deployment-healthcheck/workload_issues.sh b/codebundles/k8s-deployment-healthcheck/workload_issues.sh
@@ -5,7 +5,7 @@
 # -----------------------------------------------------------------------------
 # Author: @stewartshea
 # Description: This script takes in event message strings captured from a 
-# Kubernetes based system and provides more concrete issue details in json format. This is a migratio naway from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. 
+# Kubernetes based system and provides more concrete issue details in json format. This is a migration away from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. 
 # -----------------------------------------------------------------------------
 # Input: List of event messages, related owner kind, and related owner name
 messages="$1"
@@ -24,68 +24,67 @@ add_issue() {
 }
 
 # Check conditions and add issues to the array
-if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then
+if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then
     add_issue "2" "$owner_kind \`$owner_name\` has unready containers" "$messages" "Inspect Deployment Replicas for \`$owner_name\`"
 fi
 
-if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
+if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then
     add_issue "2" "$owner_kind \`$owner_name\` has a misconfiguration" "$messages" "Check Deployment Log For Issues for \`$owner_name\`\nGet Deployment Workload Details For \`$owner_name\` and Add to Report"
 fi
 
-if [[ $messages =~ "PodInitializing" ]]; then
+if echo "$messages" | grep -q "PodInitializing"; then
     add_issue "4" "$owner_kind \`$owner_name\` is initializing" "$messages" "Retry in a few minutes and verify that \`$owner_name\` is running.\nInspect $owner_kind Warning Events for \`$owner_name\`"
 fi
 
-if [[ $messages =~ "Startup probe failed" ]]; then
+if echo "$messages" | grep -q "Startup probe failed"; then
     add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Deployment Logs for $owner_kind \`$owner_name\`\nReview Startup Probe Configuration for $owner_kind \`$owner_name\`\nIncrease Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`"
 fi
 
-
-if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
+if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then
     add_issue "3" "$owner_kind \`$owner_name\` is restarting" "$messages" "Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`"
 fi
 
-if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then
+if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then
     add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Readiness Probe Configuration for $owner_kind \`$owner_name\`"
 fi
 
-if [[ $messages =~ "PodFailed" ]]; then
+if echo "$messages" | grep -q "PodFailed"; then
     add_issue "2" "$owner_kind \`$owner_name\` has failed pods" "$messages" "Check Pod Status and Logs for Errors"
 fi
 
-if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
+if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
     add_issue "2" "$owner_kind \`$owner_name\` has image access issues" "$messages" "List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`\nList ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`"
 fi
 
-if [[ $messages =~ "Back-off restarting failed container" ]]; then
+if echo "$messages" | grep -q "Back-off restarting failed container"; then
     add_issue "2" "$owner_kind \`$owner_name\` has failing containers" "$messages" "Check $owner_kind Log for \`$owner_name\`\nInspect Warning Events for $owner_kind \`$owner_name\`"
 fi
 
-if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then
+if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then
     add_issue "3" "$owner_kind \`$owner_name\` has resources that cannot be scheduled" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details."
 fi
 
-if [[ $messages =~ "is forbidden: [minimum cpu usage per Container" || $messages =~ "is forbidden: [minimum memory usage per Container" ]]; then
+if echo "$messages" | grep -q "is forbidden: \[minimum cpu usage per Container\|is forbidden: \[minimum memory usage per Container"; then
     add_issue "2" "$owner_kind \`$owner_name\` has invalid resource configuration" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details."
 fi
 
-if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
+if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu\|The node was low on resource\|nodes are available\|Preemption is not helpful"; then
     add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - not enough cluster resources." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Node Count in Cluster\nCheck for Quota Errors\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`"
 fi
 
-if [[ $messages =~ "max node group size reached" ]]; then
+if echo "$messages" | grep -q "max node group size reached"; then
     add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - cannot increase cluster size." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Max Node Group Size in Cluster\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`"
 fi
 
-if [[ $messages =~ "Health check failed after" ]]; then
+if echo "$messages" | grep -q "Health check failed after"; then
     add_issue "3" "$owner_kind \`$owner_name\` health check failed." "$messages" "Check $owner_kind \`$owner_name\` Health"
 fi
 
-if [[ $messages =~ "Deployment does not have minimum availability" ]]; then
+if echo "$messages" | grep -q "Deployment does not have minimum availability"; then
     add_issue "3" "$owner_kind \`$owner_name\` is not available." "$messages" "Inspect Deployment Warning Events for \`$owner_name\`"
 fi
 
-if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then
+if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then
     # Don't generate any issue data, these are normal strings
     echo "[]" | jq .
     exit 0
@@ -100,5 +99,5 @@ if [ ${#issue_details_array[@]} -gt 0 ]; then
     issues_json="[${issues_json%,}]" # Remove the last comma and wrap in square brackets
     echo "$issues_json" | jq .
 else
-    echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq .
-fi
+    echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq .
+fi
diff --git a/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh b/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh
@@ -16,93 +16,91 @@ owner_name="$3"
 # Initialize an empty array to store recommendations
 next_steps=()
 
-
-if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then
+# Check conditions and add recommendations to the array
+if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then
     next_steps+=("Inspect Deployment Replicas for \`$owner_name\`")
 fi
 
-if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
+if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then
     next_steps+=("Check Deployment Log For Issues for \`$owner_name\`")
     next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report")
 fi
 
-if [[ $messages =~ "Misconfiguration" ]]; then
+if echo "$messages" | grep -q "Misconfiguration"; then
     next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
     next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
 fi
 
-if [[ $messages =~ "PodInitializing" ]]; then
+if echo "$messages" | grep -q "PodInitializing"; then
     next_steps+=("Check $owner_kind Health for \`$owner_name\`")
     next_steps+=("Inspect $owner_kind Warning Events for \`$owner_name\`")
 fi
 
-if [[ $messages =~ "Startup probe failed" ]]; then
+if echo "$messages" | grep -q "Startup probe failed"; then
     next_steps+=("Check Deployment Logs for $owner_kind \`$owner_name\`")
     next_steps+=("Review Startup Probe Configuration for $owner_kind \`$owner_name\`")
     next_steps+=("Increase Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`")
     next_steps+=("Identify Resource Constrained Pods In Namespace \`$NAMESPACE\`")
 fi
 
-
-if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
+if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then
     next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
 
-if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then
+if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then
     next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
 
-if [[ $messages =~ "PodFailed" ]]; then
+if echo "$messages" | grep -q "PodFailed"; then
     next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
 fi
 
-if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
+if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
     next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
 fi
 
-if [[ $messages =~ "Back-off restarting failed container" ]]; then
+if echo "$messages" | grep -q "Back-off restarting failed container"; then
     next_steps+=("Check Log for $owner_kind \`$owner_name\`")
     next_steps+=("Inspect Warning Events for $owner_kind \`$owner_name\`")
-
 fi
 
-if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
+if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
     next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
 fi
 
-if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then
-    next_steps+=("Check Resource Quota Utilization in Namepace `${NAMESPACE}`")
+if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then
+    next_steps+=("Check Resource Quota Utilization in Namepace \`${NAMESPACE}\`")
 fi
 
-if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
-    next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner. ")
+if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu"; then
+    next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
     next_steps+=("Increase Node Count in Cluster")
     next_steps+=("Check for Quota Errors")
 fi
 
-if [[ $messages =~ "max node group size reached" ]]; then
+if echo "$messages" | grep -q "max node group size reached"; then
     next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
     next_steps+=("Increase node count in cluster.")
     next_steps+=("Check for quota errors.")
 fi
 
-if [[ $messages =~ "Health check failed after" ]]; then
+if echo "$messages" | grep -q "Health check failed after"; then
     next_steps+=("Check $owner_kind \`$owner_name\` Health")
 fi
 
-if [[ $messages =~ "Deployment does not have minimum availability" ]]; then
+if echo "$messages" | grep -q "Deployment does not have minimum availability"; then
     next_steps+=("Inspect Deployment Warning Events for \`$owner_name\`")
 fi
 
-## Exit on normal strings
-if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then
+# Exit on normal strings
+if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then
     # Don't generate any issue data, these are normal strings
     exit 0
 fi
 
-## Catch All
+# Catch All
 if [[ ${#next_steps[@]} -eq 0 ]]; then
     next_steps+=("Please review the report logs and escalate the issue if necessary.")
 fi

diff --git a/codebundles/k8s-namespace-healthcheck/runbook.robot b/codebundles/k8s-namespace-healthcheck/runbook.robot
@@ -172,7 +172,7 @@ Inspect Pending Pods In Namespace `${NAMESPACE}`
                 RW.Core.Add Issue
                 ...    severity=${issue["severity"]}
                 ...    expected=Pods should not be pending in `${NAMESPACE}`.
-                ...    actual=Pod `${pod_name.stdout}` is pending with ${item["containerReason"]}
+                ...    actual=Pod `${item["pod_name"]}` is pending with ${item["containerReason"]}
                 ...    title= ${issue["title"]}
                 ...    reproduce_hint=${pending_pods.cmd}
                 ...    details=${issue["details"]}
@@ -291,6 +291,7 @@ Inspect Workload Status Conditions In Namespace `${NAMESPACE}`
             ELSE
                 ${owner_kind}=    Set Variable    "Unknown"
                 ${owner_name}=    Set Variable    "Unknown"
+                Exit For Loop
             END
             ${item_next_steps}=    RW.CLI.Run Bash File
             ...    bash_file=workload_next_steps.sh