Skip to content

Commit

Permalink
Update/next steps (#384)
Browse files Browse the repository at this point in the history
* improve container restart next steps and tune up anomoly event processing

* remove unused script

* makes event age configurable

* update template

* add container restart analysis to deployment

* fix json from state to lastState. Improve OOM recommendations and add RESTART_AGE
  • Loading branch information
stewartshea authored Jun 6, 2024
1 parent 3098aab commit 4aea444
Show file tree
Hide file tree
Showing 10 changed files with 405 additions and 148 deletions.
262 changes: 262 additions & 0 deletions codebundles/k8s-deployment-healthcheck/container_restarts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
#!/bin/bash

# -----------------------------------------------------------------------------
# Script Information and Metadata
# -----------------------------------------------------------------------------
# Author: @stewartshea
# Description: This script is designed to fetch information about containers
# that are restarting in a namespace and provide helpful recommendations
# based on the status messages provided by Kubernetes
# -----------------------------------------------------------------------------

# Update PATH to ensure script dependencies are found
export PATH="$PATH"

# -------------------------- Function Definitions -----------------------------

# Check if a command exists
function check_command_exists() {
if ! command -v $1 &> /dev/null; then
echo "$1 could not be found"
exit
fi
}

# Tasks to perform when container exit code is "Error" or 1
function exit_code_error() {
logs=$(${KUBERNETES_DISTRIBUTION_BINARY} logs -p $1 --all-containers --context=${CONTEXT} -n ${NAMESPACE} )
escaped_log_data=$(echo "$logs" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | sed ':a;N;$!ba;s/\n/\\n/g')

echo $escaped_log_data
}

# Tasks to perform when container exit code is "Success"
function exit_code_success() {
echo "The exit code returned was 'Success' for pod \`$1\`. This usually means that the container has successfully run to completion."
echo "Checking which resources might be related to this pod..."
owner=$(find_resource_owner "$1")
owner_kind=$(echo $owner | jq -r '.kind')
owner_name=$(echo $owner | jq -r '.metadata.name')
labels_json=$(echo $owner | jq -r '.metadata.labels')
flux_labels=$(find_matching_labels "$labels_json" "flux")
argocd_labels=$(find_matching_labels "$labels_json" "argo")
command_override_check=$(check_manifest_configuration "$owner_kind/$owner_name")
if [[ "$command_override_check" ]]; then
echo "The command for \`$owner_kind/$owner_name\` has an explicit configuration. Please verify this is accurate: \`$command_override_check\`"
fi
echo "Detected that \`$owner_kind/$owner_name\` manages this pod. Checking for other controllers that might be related..."
if [[ "$flux_labels" ]]; then
echo "Detected FluxCD controlled resource."
flux_labels=$(echo "$flux_labels" | tr ' ' '\n')
namespace=$(echo "$flux_labels" | grep namespace: | awk -F ":" '{print $2}')
name=$(echo "$flux_labels" | grep name: | awk -F ":" '{print $2}')
issue_details="{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check that the FluxCD resources are not suspended and the manifests are accurate for app:\`$name\`, configured in GitOps namespace:\`$namespace\`\",\"details\":\"The exit code returned was 'Success' for pod \`$1\`. This usually means that the container has successfully run to completion.\"}"
elif [[ "$argocd_labels" ]]; then
echo "Detected ArgoCD"
argocd_labels=$(echo "$argocd_labels" | tr ' ' '\n')
instance=$(echo "$argocd_labels" | grep instance: | awk -F ":" '{print $2}')
issue_details="{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check that the ArgoCD resources and manifests are accurate for app instance \`$instance\`\",\"details\":\"The exit code returned was 'Success' for pod \`$1\`. This usually means that the container has successfully run to completion.\"}"
else
issue_details="{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Owner resources appear to be manually applied. Please review the manifest for \`$owner\` in \`$NAMESPACE\` for accuracy.\",\"details\":\"The exit code returned was 'Success' for pod \`$1\`. This usually means that the container has successfully run to completion.\"}"
fi
}

function find_resource_owner(){
pod=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $1 --context=${CONTEXT} -n ${NAMESPACE} -o json )
pod_owner_kind=$(echo $pod | jq -r '.metadata.ownerReferences[0].kind' )
pod_owner_name=$(echo $pod | jq -r '.metadata.ownerReferences[0].name' )
resource_owner=$(${KUBERNETES_DISTRIBUTION_BINARY} get $pod_owner_kind/$pod_owner_name --context=${CONTEXT} -n ${NAMESPACE} -o json)
resource_owner_kind=$(echo $resource_owner | jq -r '.metadata.ownerReferences[0].kind' )
resource_owner_name=$(echo $resource_owner | jq -r '.metadata.ownerReferences[0].name' )
owner=$(${KUBERNETES_DISTRIBUTION_BINARY} get $resource_owner_kind/$resource_owner_name --context=${CONTEXT} -n ${NAMESPACE} -o json)
echo $owner
}

# Define a Bash function that returns matching labels
function find_matching_labels() {
local labels_json="$1"
local text_to_match="$2"
local matching_labels=()

# Use jq to filter and return key-value pairs where either the key or value contains the specified text
matching_pairs=$(echo "$labels_json" | jq -r 'to_entries[] | select(.key | contains("'$text_to_match'")) | "\(.key):\(.value)"')

# Check if the label key or value contains the specified text
for label in $matching_pairs; do
matching_labels+=("$label")
done

echo "${matching_labels[*]}"
}

function check_manifest_configuration() {
resource="$1"
command_override_check=$(${KUBERNETES_DISTRIBUTION_BINARY} get $1 --context=${CONTEXT} -n ${NAMESPACE} -o json | jq -r '
.spec.template.spec.containers[] |
select(.command != null) |
"Container Name: \(.name)\nCommand: \(.command | join(" "))\nArguments: \(.args | join(" "))\n---"
')
echo $command_override_check
}

# ------------------------- Dependency Verification ---------------------------

# Ensure all the required binaries are accessible
check_command_exists ${KUBERNETES_DISTRIBUTION_BINARY}
check_command_exists jq

EXIT_CODE_EXPLANATIONS='{"0": "Success", "1": "Error", "2": "Misconfiguration", "130": "Pod terminated by SIGINT", "134": "Abnormal Termination SIGABRT", "137": "Pod terminated by SIGKILL - Possible OOM", "143":"Graceful Termination SIGTERM"}'

# Fetch the label selector from the deployment
DEPLOYMENT_LABEL_SELECTOR=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment ${DEPLOYMENT_NAME} --context=${CONTEXT} -n ${NAMESPACE} -o json | jq -r '.spec.selector.matchLabels | to_entries | map("\(.key)=\(.value)") | join(",")')

# Fetch pods related to the specified deployment using the label selector
pods_json=$(${KUBERNETES_DISTRIBUTION_BINARY} get pods --context=${CONTEXT} -n ${NAMESPACE} -l ${DEPLOYMENT_LABEL_SELECTOR} -o json)

container_restarts_json=$(echo "$pods_json" | jq -r --argjson exit_code_explanations "$EXIT_CODE_EXPLANATIONS" '
{
"container_restarts": [
.items[] | select(.status.containerStatuses != null) | select(any(.status.containerStatuses[]; .restartCount > 0)) | {
pod_name: .metadata.name,
containers: [
.status.containerStatuses[] | {
name: .name,
restart_count: .restartCount,
message: (.state.waiting.message // "N/A"),
terminated_reason: (.lastState.terminated.reason // "N/A"),
terminated_finishedAt: (.lastState.terminated.finishedAt // "N/A"),
terminated_exitCode: (.lastState.terminated.exitCode // "N/A"),
exit_code_explanation: ($exit_code_explanations[.lastState.terminated.exitCode | tostring] // "Unknown exit code")
}
]
}
]
}'
)

if [ "$(echo "$container_restarts_json" | jq '.container_restarts | length')" -eq 0 ]; then
echo "No containers with restarts found for deployment ${DEPLOYMENT_NAME}."
exit 0
fi

declare -A container_restarts_dict
container_restarts_dict=$(echo "$container_restarts_json" | jq -r '.container_restarts[] | {"item": .}' | jq -s add)

# Extract data and print in a clean text format
container_details_dict=()

containers=$(echo "$container_restarts_json" | jq -c '.container_restarts[]')

while read -r container; do
pod_name=$(echo "$container" | jq -r '.pod_name')

container_text="Pod Name: $pod_name\n"

container_list=$(echo "$container" | jq -c '.containers[]')

while read -r c; do
name=$(echo "$c" | jq -r '.name')
restart_count=$(echo "$c" | jq -r '.restart_count')
message=$(echo "$c" | jq -r '.message')
terminated_reason=$(echo "$c" | jq -r '.terminated_reason')
terminated_finishedAt=$(echo "$c" | jq -r '.terminated_finishedAt')
terminated_exitCode=$(echo "$c" | jq -r '.terminated_exitCode')
exit_code_explanation=$(echo "$c" | jq -r '.exit_code_explanation')

container_text+="Containers:\n"
container_text+="Name: $name\n"
container_text+="Restart Count: $restart_count\n"
container_text+="Message: $message\n"
container_text+="Terminated Reason: $terminated_reason\n"
container_text+="Terminated FinishedAt: $terminated_finishedAt\n"
container_text+="Terminated ExitCode: $terminated_exitCode\n"
container_text+="Exit Code Explanation: $exit_code_explanation\n\n"

done <<< "$container_list"

container_details_dict+=("$container_text")
done <<< "$containers"

# Print the container restart details
printf "\nContainer Restart Details: \n"
for container_text in "${container_details_dict[@]}"; do
echo -e "$container_text"
done

recommendations=()
printf "Container Restart Analysis: \n"
for item in "${container_restarts_dict[@]}"; do
# Extract the exit code explanation from the container
exit_code_explanation=$(jq -r '.item.containers[0].exit_code_explanation' <<< "$item")
message=$(jq -r '.item.containers[0].message' <<< "$item")
pod_name=$(jq -r .item.pod_name <<< "$item")
owner=$(find_resource_owner "$pod_name")
owner_kind=$(jq -r '.kind' <<< "$owner")
owner_name=$(jq -r '.metadata.name' <<< "$owner")
# Use a case statement to check the exit code and perform actions or recommendations
case "$exit_code_explanation" in
"Success")
exit_code_success "$pod_name"
;;
"Error")
echo "Container exited with an error code."
details=$(exit_code_error "$pod_name")
issue_details="{\"severity\":\"2\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check $owner_kind Log for Issues with \`$owner_name\`\\nView issue details in report for container log details\",\"details\":\"Container exited with an error code. Log Details: \\n$details\"}"
;;
"Misconfiguration")
echo "Container stopped due to misconfiguration"
issue_details="{\"severity\":\"2\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Get $owner_kind \`$owner_name\` manifest and check configuration for any mistakes.\",\"details\":\"Container stopped due to misconfiguration\"}"
;;
"Pod terminated by SIGINT")
echo "Container received SIGINT signal, indicating an interrupted process."
issue_details="{\"severity\":\"3\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check $owner_kind Event Anomalies for \`$owner_name\`\\nIf SIGINT is frequently occuring, escalate to the service or infrastructure owner for further investigation.\",\"details\":\"Container received SIGINT signal, indicating an interrupted process.\"}"
;;
"Abnormal Termination SIGABRT")
echo "Container terminated abnormally with SIGABRT signal."
issue_details="{\"severity\":\"2\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check $owner_kind Log for Issues with \`$owner_name\`\\nSIGABRT is usually a serious error. If it doesn't appear application related, escalate to the service or infrastructure owner for further investigation.\",\"details\":\"Container terminated abnormally with SIGABRT signal.\"}"
;;
"Pod terminated by SIGKILL - Possible OOM")
if [[ $message =~ "Pod was terminated in response to imminent node shutdown." ]]; then
echo "Container terminated by SIGKILL related to node shutdown."
issue_details="{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` in namespace \`${NAMESPACE}\` was evicted due to node shutdown\",\"next_steps\":\"Inspect $owner_kind replicas for \`$owner_name\`\"}"
else
echo "Container terminated by SIGKILL, possibly due to Out Of Memory. Check if the container exceeded its memory limit. Consider increasing memory allocation or optimizing the application for better memory usage."
issue_details="{\"severity\":\"2\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Check $owner_kind Log for Issues with \`$owner_name\`\\nGet Pod Resource Utilization with Top in Namespace \`$NAMESPACE\`\\nShow Pods Without Resource Limit or Resource Requests Set in Namespace \`$NAMESPACE\`\\nIdentify Resource Constrained Pods In Namespace \`$NAMESPACE\`\",\"details\":\"Container terminated by SIGKILL, possibly due to Out Of Memory. Check if the container exceeded its memory limit. Consider increasing memory allocation or optimizing the application for better memory usage.\"}"
fi
;;
"Graceful Termination SIGTERM")
echo "Container received SIGTERM signal for graceful termination.Ensure that the container's shutdown process is handling SIGTERM correctly. This may be a normal part of the pod lifecycle."
issue_details="{\"severity\":\"4\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"If SIGTERM is frequently occuring, escalate to the service or infrastructure owner for further investigation.\",\"details\":\"Container received SIGTERM signal for graceful termination.Ensure that the container's shutdown process is handling SIGTERM correctly. This may be a normal part of the pod lifecycle.\"}"
;;
*)
echo "Unknown exit code for pod \`$pod_name\`: $exit_code_explanation"
echo "$item"
# Handle unknown exit codes here
issue_details="{\"severity\":\"3\",\"title\":\"$owner_kind \`$owner_name\` has container restarts in namespace \`${NAMESPACE}\`\",\"next_steps\":\"Unknown exit code for pod \`$pod_name\`. Escalate to the service or infrastructure owner for further investigation.\",\"details\":\"Unknown exit code for pod \`$pod_name\`: $exit_code_explanation\"}"
;;
esac
# Initialize issues as an empty array if not already set
if [ -z "$issues" ]; then
issues="[]"
fi

# Concatenate issue detail to the string
if [ -n "$issue_details" ]; then
# Remove the closing bracket from issues to prepare for adding a new item
issues="${issues%]}"

# If issues is not an empty array (more than just "["), add a comma before the new item
if [ "$issues" != "[" ]; then
issues="$issues,"
fi

# Add the new issue detail and close the array
issues="$issues $issue_details]"
fi
done

# Display all unique recommendations that can be shown as Next Steps
if [ -n "$issues" ]; then
echo -e "\nRecommended Next Steps: \n"
echo "$issues"
fi
32 changes: 32 additions & 0 deletions codebundles/k8s-deployment-healthcheck/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,38 @@ Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
RW.Core.Add Pre To Report Readiness probe testing results:\n\n${readiness_probe_health.stdout}
RW.Core.Add Pre To Report Commands Used: ${readiness_probe_health.cmd}

Inspect Container Restarts for Deployment `${DEPLOYMENT_NAME}` Namespace `${NAMESPACE}`
[Documentation] Fetches pods that have container restarts and provides a report of the restart issues.
[Tags] namespace containers status restarts ${DEPLOYMENT_NAME} ${NAMESPACE}
${container_restart_analysis}= RW.CLI.Run Bash File
... bash_file=container_restarts.sh
... env=${env}
... secret_file__kubeconfig=${kubeconfig}
... include_in_history=False
${recommendations}= RW.CLI.Run Cli
... cmd=echo '${container_restart_analysis.stdout}' | awk '/Recommended Next Steps:/ {flag=1; next} flag'
... env=${env}
... include_in_history=false
IF $recommendations.stdout != ""
${recommendation_list}= Evaluate json.loads(r'''${recommendations.stdout}''') json
IF len(@{recommendation_list}) > 0
FOR ${item} IN @{recommendation_list}
RW.Core.Add Issue
... severity=${item["severity"]}
... expected=Containers should not be restarting for Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... actual=We found containers with restarts for Deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... title=${item["title"]}
... reproduce_hint=${container_restart_analysis.cmd}
... details=${item["details"]}
... next_steps=${item["next_steps"]}
END
END
END
${history}= RW.CLI.Pop Shell History
RW.Core.Add Pre To Report Summary of container restarts for Deployment `${DEPLOYMENT_NAME}` in namespace: ${NAMESPACE}
RW.Core.Add Pre To Report ${container_restart_analysis.stdout}
RW.Core.Add Pre To Report Commands Used:\n${history}

Inspect Deployment Warning Events for `${DEPLOYMENT_NAME}`
[Documentation] Fetches warning events related to the deployment workload in the namespace and triages any issues found in the events.
[Tags] events workloads errors warnings get deployment ${DEPLOYMENT_NAME}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ spec:
- name: SERVICE_EXCLUDE_PATTERN
value: (node_modules|opentelemetry)
- name: ANOMALY_THRESHOLD
value: "5.0"
value: "3.0"
- name: EVENT_AGE
value: "30"
secretsProvided:
- name: kubeconfig
workspaceKey: {{custom.kubeconfig_secret_name}}
Loading

0 comments on commit 4aea444

Please sign in to comment.