diff --git a/codebundles/k8s-deployment-healthcheck/workload_issues.sh b/codebundles/k8s-deployment-healthcheck/workload_issues.sh index 667055d1..05f7f98d 100755 --- a/codebundles/k8s-deployment-healthcheck/workload_issues.sh +++ b/codebundles/k8s-deployment-healthcheck/workload_issues.sh @@ -5,7 +5,7 @@ # ----------------------------------------------------------------------------- # Author: @stewartshea # Description: This script takes in event message strings captured from a -# Kubernetes based system and provides more concrete issue details in json format. This is a migratio naway from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. +# Kubernetes based system and provides more concrete issue details in json format. This is a migration away from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. # ----------------------------------------------------------------------------- # Input: List of event messages, related owner kind, and related owner name messages="$1" @@ -24,68 +24,67 @@ add_issue() { } # Check conditions and add issues to the array -if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then add_issue "2" "$owner_kind \`$owner_name\` has unready containers" "$messages" "Inspect Deployment Replicas for \`$owner_name\`" fi -if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then add_issue "2" "$owner_kind \`$owner_name\` has a misconfiguration" "$messages" "Check Deployment Log For Issues for \`$owner_name\`\nGet Deployment Workload Details For \`$owner_name\` and Add to Report" fi -if [[ $messages =~ "PodInitializing" ]]; then +if echo "$messages" | grep -q "PodInitializing"; then add_issue "4" "$owner_kind 
\`$owner_name\` is initializing" "$messages" "Retry in a few minutes and verify that \`$owner_name\` is running.\nInspect $owner_kind Warning Events for \`$owner_name\`" fi -if [[ $messages =~ "Startup probe failed" ]]; then +if echo "$messages" | grep -q "Startup probe failed"; then add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Deployment Logs for $owner_kind \`$owner_name\`\nReview Startup Probe Configuration for $owner_kind \`$owner_name\`\nIncrease Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`" fi - -if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then +if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then add_issue "3" "$owner_kind \`$owner_name\` is restarting" "$messages" "Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`" fi -if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then +if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Readiness Probe Configuration for $owner_kind \`$owner_name\`" fi -if [[ $messages =~ "PodFailed" ]]; then +if echo "$messages" | grep -q "PodFailed"; then add_issue "2" "$owner_kind \`$owner_name\` has failed pods" "$messages" "Check Pod Status and Logs for Errors" fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then +if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then add_issue "2" "$owner_kind \`$owner_name\` has image access issues" "$messages" "List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`\nList ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`" fi -if [[ $messages =~ "Back-off restarting 
failed container" ]]; then +if echo "$messages" | grep -q "Back-off restarting failed container"; then add_issue "2" "$owner_kind \`$owner_name\` has failing containers" "$messages" "Check $owner_kind Log for \`$owner_name\`\nInspect Warning Events for $owner_kind \`$owner_name\`" fi -if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then +if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then add_issue "3" "$owner_kind \`$owner_name\` has resources that cannot be scheduled" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details." fi -if [[ $messages =~ "is forbidden: [minimum cpu usage per Container" || $messages =~ "is forbidden: [minimum memory usage per Container" ]]; then +if echo "$messages" | grep -q "is forbidden: \[minimum cpu usage per Container\|is forbidden: \[minimum memory usage per Container"; then add_issue "2" "$owner_kind \`$owner_name\` has invalid resource configuration" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details." fi -if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then +if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu\|The node was low on resource\|nodes are available\|Preemption is not helpful"; then add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - not enough cluster resources." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Node Count in Cluster\nCheck for Quota Errors\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`" fi -if [[ $messages =~ "max node group size reached" ]]; then +if echo "$messages" | grep -q "max node group size reached"; then add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - cannot increase cluster size." 
"$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Max Node Group Size in Cluster\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`" fi -if [[ $messages =~ "Health check failed after" ]]; then +if echo "$messages" | grep -q "Health check failed after"; then add_issue "3" "$owner_kind \`$owner_name\` health check failed." "$messages" "Check $owner_kind \`$owner_name\` Health" fi -if [[ $messages =~ "Deployment does not have minimum availability" ]]; then +if echo "$messages" | grep -q "Deployment does not have minimum availability"; then add_issue "3" "$owner_kind \`$owner_name\` is not available." "$messages" "Inspect Deployment Warning Events for \`$owner_name\`" fi -if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then +if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then # Don't generate any issue data, these are normal strings echo "[]" | jq . exit 0 @@ -100,5 +99,5 @@ if [ ${#issue_details_array[@]} -gt 0 ]; then issues_json="[${issues_json%,}]" # Remove the last comma and wrap in square brackets echo "$issues_json" | jq . else - echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq . -fi \ No newline at end of file + echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq . 
+fi diff --git a/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh b/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh index 52feab11..4c6bb89e 100755 --- a/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh +++ b/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh @@ -16,93 +16,91 @@ owner_name="$3" # Initialize an empty array to store recommendations next_steps=() - -if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then +# Check conditions and add recommendations to the array +if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then next_steps+=("Inspect Deployment Replicas for \`$owner_name\`") fi -if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then next_steps+=("Check Deployment Log For Issues for \`$owner_name\`") next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report") fi -if [[ $messages =~ "Misconfiguration" ]]; then +if echo "$messages" | grep -q "Misconfiguration"; then next_steps+=("Review configuration of $owner_kind \`$owner_name\`") next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`") fi -if [[ $messages =~ "PodInitializing" ]]; then +if echo "$messages" | grep -q "PodInitializing"; then next_steps+=("Check $owner_kind Health for \`$owner_name\`") next_steps+=("Inspect $owner_kind Warning Events for \`$owner_name\`") fi -if [[ $messages =~ "Startup probe failed" ]]; then +if echo "$messages" | grep -q "Startup probe failed"; then next_steps+=("Check Deployment Logs for $owner_kind \`$owner_name\`") next_steps+=("Review Startup Probe Configuration for $owner_kind \`$owner_name\`") next_steps+=("Increase Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`") next_steps+=("Identify Resource Constrained Pods In Namespace \`$NAMESPACE\`") fi - -if [[ 
$messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then +if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then +if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "PodFailed" ]]; then +if echo "$messages" | grep -q "PodFailed"; then next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then +if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`") next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`") fi -if [[ $messages =~ "Back-off restarting failed container" ]]; then +if echo "$messages" | grep -q "Back-off restarting failed container"; then next_steps+=("Check Log for $owner_kind \`$owner_name\`") next_steps+=("Inspect Warning Events for $owner_kind \`$owner_name\`") - fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then +if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`") next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`") fi -if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then - next_steps+=("Check Resource Quota Utilization in Namepace `${NAMESPACE}`") +if 
echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then + next_steps+=("Check Resource Quota Utilization in Namepace \`${NAMESPACE}\`") fi -if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then - next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner. ") +if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu"; then + next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.") next_steps+=("Increase Node Count in Cluster") next_steps+=("Check for Quota Errors") fi -if [[ $messages =~ "max node group size reached" ]]; then +if echo "$messages" | grep -q "max node group size reached"; then next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.") next_steps+=("Increase node count in cluster.") next_steps+=("Check for quota errors.") fi -if [[ $messages =~ "Health check failed after" ]]; then +if echo "$messages" | grep -q "Health check failed after"; then next_steps+=("Check $owner_kind \`$owner_name\` Health") fi -if [[ $messages =~ "Deployment does not have minimum availability" ]]; then +if echo "$messages" | grep -q "Deployment does not have minimum availability"; then next_steps+=("Inspect Deployment Warning Events for \`$owner_name\`") fi -## Exit on normal strings -if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then +# Exit on normal strings +if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then # Don't generate any issue data, these are normal strings exit 0 fi -## Catch All +# Catch All if [[ ${#next_steps[@]} -eq 0 ]]; then 
next_steps+=("Please review the report logs and escalate the issue if necessary.") fi diff --git a/codebundles/k8s-namespace-healthcheck/runbook.robot b/codebundles/k8s-namespace-healthcheck/runbook.robot index 0ca66158..21b6b029 100644 --- a/codebundles/k8s-namespace-healthcheck/runbook.robot +++ b/codebundles/k8s-namespace-healthcheck/runbook.robot @@ -172,7 +172,7 @@ Inspect Pending Pods In Namespace `${NAMESPACE}` RW.Core.Add Issue ... severity=${issue["severity"]} ... expected=Pods should not be pending in `${NAMESPACE}`. - ... actual=Pod `${pod_name.stdout}` is pending with ${item["containerReason"]} + ... actual=Pod `${item["pod_name"]}` is pending with ${item["containerReason"]} ... title= ${issue["title"]} ... reproduce_hint=${pending_pods.cmd} ... details=${issue["details"]} @@ -291,6 +291,7 @@ Inspect Workload Status Conditions In Namespace `${NAMESPACE}` ELSE ${owner_kind}= Set Variable "Unknown" ${owner_name}= Set Variable "Unknown" + Exit For Loop END ${item_next_steps}= RW.CLI.Run Bash File ... bash_file=workload_next_steps.sh diff --git a/codebundles/k8s-namespace-healthcheck/workload_issues.sh b/codebundles/k8s-namespace-healthcheck/workload_issues.sh index f2b7d3fa..b3e5e9f2 100644 --- a/codebundles/k8s-namespace-healthcheck/workload_issues.sh +++ b/codebundles/k8s-namespace-healthcheck/workload_issues.sh @@ -5,7 +5,7 @@ # ----------------------------------------------------------------------------- # Author: @stewartshea # Description: This script takes in event message strings captured from a -# Kubernetes based system and provides more concrete issue details in json format. This is a migratio naway from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. +# Kubernetes based system and provides more concrete issue details in json format. This is a migration away from workload_next_steps.sh in order to support dynamic severity generation and more robust next step details. 
# ----------------------------------------------------------------------------- # Input: List of event messages, related owner kind, and related owner name messages="$1" @@ -24,82 +24,81 @@ add_issue() { } # Check conditions and add issues to the array -if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then add_issue "2" "$owner_kind \`$owner_name\` has unready containers" "$messages" "Inspect Deployment Replicas for \`$owner_name\`" fi -if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then add_issue "2" "$owner_kind \`$owner_name\` has a misconfiguration" "$messages" "Check Deployment Log For Issues for \`$owner_name\`\nGet Deployment Workload Details For \`$owner_name\` and Add to Report" fi -if [[ $messages =~ "PodInitializing" ]]; then +if echo "$messages" | grep -q "PodInitializing"; then add_issue "4" "$owner_kind \`$owner_name\` is initializing" "$messages" "Retry in a few minutes and verify that \`$owner_name\` is running.\nInspect $owner_kind Warning Events for \`$owner_name\`" fi -if [[ $messages =~ "Startup probe failed" ]]; then +if echo "$messages" | grep -q "Startup probe failed"; then add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Deployment Logs for $owner_kind \`$owner_name\`\nReview Startup Probe Configuration for $owner_kind \`$owner_name\`\nIncrease Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`" fi -if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then +if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then add_issue "3" "$owner_kind \`$owner_name\` is restarting" "$messages" "Check Liveliness Probe Configuration for 
$owner_kind \`$owner_name\`" fi -if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then +if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then add_issue "2" "$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Readiness Probe Configuration for $owner_kind \`$owner_name\`" fi -if [[ $messages =~ "PodFailed" ]]; then +if echo "$messages" | grep -q "PodFailed"; then add_issue "2" "$owner_kind \`$owner_name\` has failed pods" "$messages" "Check Pod Status and Logs for Errors" fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then +if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then add_issue "2" "$owner_kind \`$owner_name\` has image access issues" "$messages" "List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`\nList ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`" fi -if [[ $messages =~ "Back-off restarting failed container" ]]; then +if echo "$messages" | grep -q "Back-off restarting failed container"; then add_issue "2" "$owner_kind \`$owner_name\` has failing containers" "$messages" "Check $owner_kind Log for \`$owner_name\`\nInspect Warning Events for $owner_kind \`$owner_name\`" fi -if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then +if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then add_issue "3" "$owner_kind \`$owner_name\` has resources that cannot be scheduled" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details." 
fi -if [[ $messages =~ "is forbidden: [minimum cpu usage per Container" || $messages =~ "is forbidden: [minimum memory usage per Container" ]]; then +if echo "$messages" | grep -q "is forbidden: \[minimum cpu usage per Container\|is forbidden: \[minimum memory usage per Container"; then add_issue "2" "$owner_kind \`$owner_name\` has invalid resource configuration" "$messages" "Adjust resource configuration for $owner_kind \`$owner_name\` according to issue details." fi -if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" || $messages =~ "The node was low on resource" ]]; then +if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu\|The node was low on resource\|nodes are available\|Preemption is not helpful"; then add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - not enough cluster resources." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Node Count in Cluster\nCheck for Quota Errors\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`" fi -if [[ $messages =~ "max node group size reached" ]]; then +if echo "$messages" | grep -q "max node group size reached"; then add_issue "2" "$owner_kind \`$owner_name\` cannot be scheduled - cannot increase cluster size." "$messages" "Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.\nIncrease Max Node Group Size in Cluster\nIdentify High Utilization Nodes for Cluster \`${CONTEXT}\`" fi -if [[ $messages =~ "Health check failed after" ]]; then +if echo "$messages" | grep -q "Health check failed after"; then add_issue "3" "$owner_kind \`$owner_name\` health check failed." 
"$messages" "Check $owner_kind \`$owner_name\` Health" fi -if [[ $messages =~ "Deployment does not have minimum availability" ]]; then +if echo "$messages" | grep -q "Deployment does not have minimum availability"; then add_issue "3" "$owner_kind \`$owner_name\` is not available." "$messages" "Inspect Deployment Warning Events for \`$owner_name\`" fi -if [[ $messages =~ "failed to download archive" ]]; then +if echo "$messages" | grep -q "failed to download archive"; then add_issue "3" "$owner_kind \`$owner_name\` has internal connectivity issues fetching source" "$messages" "Escalate connectivity issues to service owner if they continue." fi - -if [[ $messages =~ "OCI runtime exec failed: exec failed: unable to start container process" ]]; then +if echo "$messages" | grep -q "OCI runtime exec failed: exec failed: unable to start container process"; then add_issue "2" "Possible node or container runtime issue" "$messages" "Escalate container runtime issue to service owner if they continue." fi -if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then +if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then # Don't generate any issue data, these are normal strings echo "[]" | jq . exit 0 fi -if [[ $messages =~ "connect: connection refused" ]]; then +if echo "$messages" | grep -q "connect: connection refused"; then add_issue "3" "Internal connectivity issues detected" "$messages" "Escalate connectivity issues to service owner if they continue." fi @@ -113,4 +112,4 @@ if [ ${#issue_details_array[@]} -gt 0 ]; then echo "$issues_json" | jq . 
else echo "[{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has issues that require further investigation.\",\"details\":\"$messages\",\"next_steps\":\"Escalate issues for $owner_kind \`$owner_name\` to service owner \"}]" | jq . -fi \ No newline at end of file +fi diff --git a/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh b/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh index dcc9a025..b9fa0cb9 100755 --- a/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh +++ b/codebundles/k8s-namespace-healthcheck/workload_next_steps.sh @@ -16,85 +16,86 @@ owner_name="$3" # Initialize an empty array to store recommendations next_steps=() - -if [[ $messages =~ "ContainersNotReady" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "ContainersNotReady" && [[ $owner_kind == "Deployment" ]]; then next_steps+=("Inspect Deployment Replicas for \`$owner_name\`") fi -if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then +if echo "$messages" | grep -q "ContainersNotReady\|containers with unready status"; then + next_steps+=("Check container restarts for $owner_kind \`$owner_name\`") +fi + +if echo "$messages" | grep -q "Misconfiguration" && [[ $owner_kind == "Deployment" ]]; then next_steps+=("Check Deployment Log For Issues for \`$owner_name\`") next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report") fi -if [[ $messages =~ "Misconfiguration" ]]; then +if echo "$messages" | grep -q "Misconfiguration"; then next_steps+=("Review configuration of $owner_kind \`$owner_name\`") next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`") fi -if [[ $messages =~ "PodInitializing" ]]; then +if echo "$messages" | grep -q "PodInitializing"; then next_steps+=("Check $owner_kind Health for \`$owner_name\`") next_steps+=("Inspect $owner_kind Warning Events for \`$owner_name\`") fi -if [[ $messages =~ "Startup probe failed" ]]; then - add_issue "2" 
"$owner_kind \`$owner_name\` is unable to start" "$messages" "Check Deployment Logs for $owner_kind \`$owner_name\`\nReview Startup Probe Configuration for $owner_kind \`$owner_name\`\nIncrease Startup Probe Timeout and Threshold for $owner_kind \`$owner_name\`\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`" -fi - -if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then +if echo "$messages" | grep -q "Liveness probe failed\|Liveness probe errored"; then next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe failed" ]]; then +if echo "$messages" | grep -q "Readiness probe errored\|Readiness probe failed"; then next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "PodFailed" ]]; then +if echo "$messages" | grep -q "PodFailed"; then next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`") fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then +if echo "$messages" | grep -q "ImagePullBackOff\|Back-off pulling image\|ErrImagePull"; then next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`") next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`") fi -if [[ $messages =~ "Back-off restarting failed container" ]]; then +if echo "$messages" | grep -q "Back-off restarting failed container"; then next_steps+=("Check Log for $owner_kind \`$owner_name\`") next_steps+=("Inspect Warning Events for $owner_kind \`$owner_name\`") - fi -if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then - next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`") - next_steps+=("List Images and Tags for 
Every Container in Failed Pods for Namespace \`$NAMESPACE\`") -fi - -if [[ $messages =~ "forbidden: failed quota" || $messages =~ "forbidden: exceeded quota" ]]; then +if echo "$messages" | grep -q "forbidden: failed quota\|forbidden: exceeded quota"; then next_steps+=("Check Resource Quota Utilization in Namepace \`$NAMESPACE\`") fi -if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then +if echo "$messages" | grep -q "No preemption victims found for incoming pod\|Insufficient cpu"; then next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to the service owner of cluster context \`$CONTEXT\`. ") next_steps+=("Increase node count in cluster context \`$CONTEXT\`") next_steps+=("Check Cloud Provider Quota Errors") fi -if [[ $messages =~ "max node group size reached" ]]; then +if echo "$messages" | grep -q "max node group size reached"; then next_steps+=("Not enough node resources available to schedule pods. 
Escalate this issue to the service owner of cluster context \`$CONTEXT\`") next_steps+=("Increase node count in cluster context \`$CONTEXT\`") next_steps+=("Check Cloud Provider Quota Errors") fi -if [[ $messages =~ "Health check failed after" ]]; then +if echo "$messages" | grep -q "Health check failed after"; then next_steps+=("Check $owner_kind \`$owner_name\` Health") fi -if [[ $messages =~ "Deployment does not have minimum availability" ]]; then +if echo "$messages" | grep -q "Deployment does not have minimum availability"; then next_steps+=("Inspect Deployment Warning Events for \`$owner_name\`") fi +if echo "$messages" | grep -q "Pod was terminated in response to imminent node shutdown\|TerminationByKubelet"; then + next_steps+=("Verify $owner_kind \`$owner_name\` health.") + next_steps+=("Verify node restarts or maintenance activities are expected.") +fi + +if echo "$messages" | grep -q "Startup probe failed"; then + next_steps+=("Check Startup Probe Configuration for $owner_kind \`$owner_name\`") +fi -##Exit on normal strings -if [[ $messages =~ "Created container server" || $messages =~ "no changes since last reconcilation" || $messages =~ "Reconciliation finished" || "successfully rotated K8s secret" ]]; then +## Exit on normal strings +if echo "$messages" | grep -q "Created container server\|no changes since last reconcilation\|Reconciliation finished\|successfully rotated K8s secret"; then # Don't generate any issue data, these are normal strings exit 0 fi