Skip to content

Commit

Permalink
K8s ns/more next steps (#413)
Browse files Browse the repository at this point in the history
* change older workload_next_steps structure

* update all the scripts to better handle multi word pattern matching

* workload next steps touchup. fix pod name in pending pod task
  • Loading branch information
stewartshea authored Jul 25, 2024
1 parent c34ec19 commit 3838427
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 94 deletions.
39 changes: 19 additions & 20 deletions codebundles/k8s-deployment-healthcheck/workload_issues.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# -----------------------------------------------------------------------------
# Author: @stewartshea
# Description: This script takes in event message strings captured from a
# Kubernetes based system and provides more concrete issue details in json format. This is a migratio naway from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details.
# Kubernetes based system and provides more concrete issue details in json format. This is a migration away from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details.
# -----------------------------------------------------------------------------
# Input: List of event messages, related owner kind, and related owner name
messages="$1"
Expand All @@ -24,68 +24,67 @@ add_issue() {
}

# Check conditions and add issues to the array
if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then
if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then
add_issue "2" "$owner_kind \`$owner_name\` has unready containers" "$messages" "Inspect Deployment Replicas for \`$owner_name\`"
fi

if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then
add_issue "2" "$owner_kind \`$owner_name\` has a misconfiguration" "$messages" "Check Deployment Log For Issues for \`$owner_name\`\nGet Deployment Workload Details For \`$owner_name\` and Add to Report"
fi

if [[ $messages =~ "PodInitializing" ]]; then
if echo "$messages" | grep -q "PodInitializing"; then
add_issue "4" "$owner_kind \`$owner_name\` is initializing" "$messages" "Retry in a few minutes and verify that \`$owner_name\` is running.\nInspect $owner_kind Warning Events for \`$owner_name\`"
fi

if [[ $messages =~ "Startup probe failed" ]]; then
if echo "$messages" | grep -q "Startup probe failed"; then
add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Deployment Logs for $owner_kind \`$owner_name\`\nReview Startup Probe Configuration for $owner_kind \`$owner_name\`\nIncrease Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`"
fi


if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then
add_issue "3" "$owner_kind \`$owner_name\` is restarting" "$messages" "Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`"
fi

if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then
if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then
add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Readiness Probe Configuration for $owner_kind \`$owner_name\`"
fi

if [[ $messages =~ "PodFailed" ]]; then
if echo "$messages" | grep -q "PodFailed"; then
add_issue "2" "$owner_kind \`$owner_name\` has failed pods" "$messages" "Check Pod Status and Logs for Errors"
fi

if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
add_issue "2" "$owner_kind \`$owner_name\` has image access issues" "$messages" "List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`\nList ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`"
fi

if [[ $messages =~ "Back-off restarting failed container" ]]; then
if echo "$messages" | grep -q "Back-off restarting failed container"; then
add_issue "2" "$owner_kind \`$owner_name\` has failing containers" "$messages" "Check $owner_kind Log for \`$owner_name\`\nInspect Warning Events for $owner_kind \`$owner_name\`"
fi

if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then
if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then
add_issue "3" "$owner_kind \`$owner_name\` has resources that cannot be scheduled" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details."
fi

if [[ $messages =~ "is forbidden: [minimum cpu usage per Container" || $messages =~ "is forbidden: [minimum memory usage per Container" ]]; then
if echo "$messages" | grep -q "is forbidden: \[minimum cpu usage per Container\|is forbidden: \[minimum memory usage per Container"; then
add_issue "2" "$owner_kind \`$owner_name\` has invalid resource configuration" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details."
fi

if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu\|The node was low on resource\|nodes are available\|Preemption is not helpful"; then
add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - not enough cluster resources." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Node Count in Cluster\nCheck for Quota Errors\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`"
fi

if [[ $messages =~ "max node group size reached" ]]; then
if echo "$messages" | grep -q "max node group size reached"; then
add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - cannot increase cluster size." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Max Node Group Size in Cluster\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`"
fi

if [[ $messages =~ "Health check failed after" ]]; then
if echo "$messages" | grep -q "Health check failed after"; then
add_issue "3" "$owner_kind \`$owner_name\` health check failed." "$messages" "Check $owner_kind \`$owner_name\` Health"
fi

if [[ $messages =~ "Deployment does not have minimum availability" ]]; then
if echo "$messages" | grep -q "Deployment does not have minimum availability"; then
add_issue "3" "$owner_kind \`$owner_name\` is not available." "$messages" "Inspect Deployment Warning Events for \`$owner_name\`"
fi

if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then
if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then
# Don't generate any issue data, these are normal strings
echo "[]" | jq .
exit 0
Expand All @@ -100,5 +99,5 @@ if [ ${#issue_details_array[@]} -gt 0 ]; then
issues_json="[${issues_json%,}]" # Remove the last comma and wrap in square brackets
echo "$issues_json" | jq .
else
echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq .
fi
echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq .
fi
46 changes: 22 additions & 24 deletions codebundles/k8s-deployment-healthcheck/workload_next_steps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,93 +16,91 @@ owner_name="$3"
# Initialize an empty array to store recommendations
next_steps=()


if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then
# Check conditions and add recommendations to the array
if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then
next_steps+=("Inspect Deployment Replicas for \`$owner_name\`")
fi

if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then
next_steps+=("Check Deployment Log For Issues for \`$owner_name\`")
next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report")
fi

if [[ $messages =~ "Misconfiguration" ]]; then
if echo "$messages" | grep -q "Misconfiguration"; then
next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
fi

if [[ $messages =~ "PodInitializing" ]]; then
if echo "$messages" | grep -q "PodInitializing"; then
next_steps+=("Check $owner_kind Health for \`$owner_name\`")
next_steps+=("Inspect $owner_kind Warning Events for \`$owner_name\`")
fi

if [[ $messages =~ "Startup probe failed" ]]; then
if echo "$messages" | grep -q "Startup probe failed"; then
next_steps+=("Check Deployment Logs for $owner_kind \`$owner_name\`")
next_steps+=("Review Startup Probe Configuration for $owner_kind \`$owner_name\`")
next_steps+=("Increase Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`")
next_steps+=("Identify Resource Constrained Pods In Namespace \`$NAMESPACE\`")
fi


if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then
next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
fi

if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then
if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then
next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
fi

if [[ $messages =~ "PodFailed" ]]; then
if echo "$messages" | grep -q "PodFailed"; then
next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
fi

if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
fi

if [[ $messages =~ "Back-off restarting failed container" ]]; then
if echo "$messages" | grep -q "Back-off restarting failed container"; then
next_steps+=("Check Log for $owner_kind \`$owner_name\`")
next_steps+=("Inspect Warning Events for $owner_kind \`$owner_name\`")

fi

if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then
next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
fi

if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then
next_steps+=("Check Resource Quota Utilization in Namepace `${NAMESPACE}`")
if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then
next_steps+=("Check Resource Quota Utilization in Namespace \`${NAMESPACE}\`")
fi

if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner. ")
if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu"; then
next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
next_steps+=("Increase Node Count in Cluster")
next_steps+=("Check for Quota Errors")
fi

if [[ $messages =~ "max node group size reached" ]]; then
if echo "$messages" | grep -q "max node group size reached"; then
next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
next_steps+=("Increase node count in cluster.")
next_steps+=("Check for quota errors.")
fi

if [[ $messages =~ "Health check failed after" ]]; then
if echo "$messages" | grep -q "Health check failed after"; then
next_steps+=("Check $owner_kind \`$owner_name\` Health")
fi

if [[ $messages =~ "Deployment does not have minimum availability" ]]; then
if echo "$messages" | grep -q "Deployment does not have minimum availability"; then
next_steps+=("Inspect Deployment Warning Events for \`$owner_name\`")
fi

## Exit on normal strings
if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then
# Exit on normal strings
if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then
# Don't generate any issue data, these are normal strings
exit 0
fi

## Catch All
# Catch All
if [[ ${#next_steps[@]} -eq 0 ]]; then
next_steps+=("Please review the report logs and escalate the issue if necessary.")
fi
Expand Down
3 changes: 2 additions & 1 deletion codebundles/k8s-namespace-healthcheck/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ Inspect Pending Pods In Namespace `${NAMESPACE}`
RW.Core.Add Issue
... severity=${issue["severity"]}
... expected=Pods should not be pending in `${NAMESPACE}`.
... actual=Pod `${pod_name.stdout}` is pending with ${item["containerReason"]}
... actual=Pod `${item["pod_name"]}` is pending with ${item["containerReason"]}
... title= ${issue["title"]}
... reproduce_hint=${pending_pods.cmd}
... details=${issue["details"]}
Expand Down Expand Up @@ -291,6 +291,7 @@ Inspect Workload Status Conditions In Namespace `${NAMESPACE}`
ELSE
${owner_kind}= Set Variable "Unknown"
${owner_name}= Set Variable "Unknown"
Exit For Loop
END
${item_next_steps}= RW.CLI.Run Bash File
... bash_file=workload_next_steps.sh
Expand Down
Loading

0 comments on commit 3838427

Please sign in to comment.