Skip to content

Commit

Permalink
Updates/ns (#316)
Browse files Browse the repository at this point in the history
* test script update to fix next step var sub

* revert prev change and fix echo

* update next steps for unready kustomizations

* add reproduce_hint

* fix typo, another next step

* deployment logs update

* update next steps for deployments
  • Loading branch information
stewartshea authored Jan 31, 2024
1 parent 8c86b9b commit fcfc064
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 62 deletions.
48 changes: 20 additions & 28 deletions codebundles/k8s-deployment-healthcheck/deployment_logs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,14 @@ SEARCH_RESOURCES=""

# Format file / table http_logrus_custom
# Search for http log format used by online-boutique (which uses logrus but is custom)
echo "Query for HTTP Path patterns"
for FILE in "${LOG_FILES[@]}"; do
echo "$FILE"
LOG_SUMMARY=$(lnav -n -c ';SELECT COUNT(*) AS error_count, CASE WHEN "http.req.path" LIKE "/product%" THEN "/product" ELSE "http.req.path" END AS root_path, "http.resp.status" FROM http_logrus_custom WHERE "http.resp.status" = 500 AND NOT "http.req.path" = "/" GROUP BY root_path, "http.resp.status" ORDER BY error_count DESC;' $FILE)
echo "$LOG_SUMMARY"
INTERESTING_PATHS+=$(echo "$LOG_SUMMARY" | awk 'NR>1 && NR<5 {sub(/^\//, "", $2); print $2}')$'\n'
if [[ $LOG_SUMMARY ]]; then
INTERESTING_PATHS+=$(echo "$LOG_SUMMARY" | awk 'NR>1 && NR<5 {sub(/^\//, "", $2); print $2}')$'\n'
fi
done

if [[ -n "$INTERESTING_PATHS" ]]; then
Expand All @@ -129,7 +132,17 @@ else
echo "No interesting HTTP paths found."
fi

## Lightweight - we explicitly specify which resources we want to search
# Run RESOURCE_SEARCH_LIST only if SEARCH_RESOURCES has content
if [[ -n "$SEARCH_RESOURCES" ]]; then
RESOURCE_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment,pods,service,statefulset --context=${CONTEXT} -n ${NAMESPACE})
else
echo "No search queries based on HTTP Paths returned results."
fi


# Search for error fields and strings
echo "Query for generic error logs and sort"
for FILE in "${LOG_FILES[@]}"; do
echo "$FILE"
ERROR_SUMMARY=$(lnav -n -c ';SELECT error, COUNT(*) AS count FROM http_logrus_custom WHERE error IS NOT NULL GROUP BY error;' $FILE)
Expand All @@ -139,30 +152,11 @@ done
ERROR_FUZZY_STRING=$(echo "$ERROR_FUZZY_STRING" | sort | uniq)
##### End query #####




# # Fetch a list of all resources in the namespace
## Heavyweight - this times out after 30s, but is a better way to get any and all resources
# SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} api-resources --verbs=list --namespaced -o name | xargs -n 1 ${KUBERNETES_DISTRIBUTION_BINARY} get --show-kind --ignore-not-found -n $NAMESPACE)

## Lightweight - we explicitly specify which resources we want to search
# Run RESOURCE_SEARCH_LIST only if SEARCH_RESOURCES has content
if [[ -n "$SEARCH_RESOURCES" ]]; then
RESOURCE_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment,pods,service,statefulset --context=${CONTEXT} -n ${NAMESPACE})
else
echo "No search queries returned results."
exit
fi



# Fuzzy match env vars in deployments with ERROR_FUZZY_STRING
declare -a FUZZY_ENV_VAR_RESOURCE_MATCHES
if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then
if [[ -n "$ERROR_FUZZY_STRING" ]]; then
# Filter out common words from ERROR_FUZZY_STRING
FILTERED_ERROR_STRING=$(filter_common_words "$ERROR_FUZZY_STRING")

# Convert FILTERED_ERROR_STRING into an array
mapfile -t PATTERNS <<< "$FILTERED_ERROR_STRING"

Expand All @@ -188,7 +182,6 @@ if [[ -n "$SEARCH_RESOURCES" && -n "$ERROR_FUZZY_STRING" ]]; then
done
else
echo "No search queries or fuzzy matches to perform."
exit
fi

for match in "${FUZZY_ENV_VAR_RESOURCE_MATCHES[@]}"; do
Expand All @@ -202,7 +195,7 @@ done

# Fetch namespace events for searching through
EVENT_SEARCH_LIST=$(${KUBERNETES_DISTRIBUTION_BINARY} get events --context=${CONTEXT} -n ${NAMESPACE})
event_details="\nThe namespace `${NAMESPACE}` has produced the following interesting events:"
event_details="\nThe namespace \`${NAMESPACE}\` has produced the following interesting events:"
event_details+="\n"

# For each value, search the namespace for applicable resources and events
Expand All @@ -226,6 +219,7 @@ if [[ ${#FUZZY_ENV_VAR_RESOURCE_MATCHES[@]} -ne 0 ]]; then
env_value=${parts[3]}

if [[ -z ${seen_resources[$resource]} ]]; then
issue_descriptions+=("Error log could be related to \`$resource\`")
recommendations+=("Review manifest for \`$resource\` in namespace: \`${NAMESPACE}\`. Matched error log string \`$string\` in environment variable \`$env_key\`. ")
seen_resources[$resource]=1
fi
Expand Down Expand Up @@ -255,18 +249,16 @@ if [[ -n "$INTERESTING_RESOURCES" ]]; then
fi
;;
deployment|deployment.apps)
recommendations+=("Check deployment health \`$name\` in namespace \`${NAMESPACE}\`")
recommendations+=("Check Deployment health \`$name\` in namespace \`${NAMESPACE}\`")
;;
service)
recommendations+=("Check service health \`$name\` in namespace \`${NAMESPACE}\`")
recommendations+=("Check Service health \`$name\` in namespace \`${NAMESPACE}\`")
;;
statefulset|statefulset.apps)
recommendations+=("Check statefulset health \`$name\` in namespace \`${NAMESPACE}\`")
recommendations+=("Check Statefulset health \`$name\` in namespace \`${NAMESPACE}\`")
;;
esac
done <<< "$INTERESTING_RESOURCES"
else
echo "No resources found based on log query output"
fi

# Display the issue descriptions
Expand Down
25 changes: 13 additions & 12 deletions codebundles/k8s-deployment-healthcheck/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,17 @@ Check Deployment Log For Issues with `${DEPLOYMENT_NAME}`
... cmd=awk '/Issues Identified:/ {start=1; next} /The namespace `${NAMESPACE}` has produced the following interesting events:/ {start=0} start' <<< '''${logs.stdout}'''
... env=${env}
... include_in_history=false
# FIXME: Refactor this to a loop of 1 issue per line of issue output - better aligning next steps with specific issues
RW.Core.Add Issue
... severity=2
... expected=No logs matching error patterns found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... actual=Error logs found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... title=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` has error logs.
... reproduce_hint=View Commands Used in Report Output
... details=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` generated the following log analysis: \n${logs.stdout}
... next_steps=${recommendations.stdout}

## We should improve deployment_logs.sh to generate a match issue + next steps + severity level
IF len($issues.stdout) > 0
RW.Core.Add Issue
... severity=3
... expected=No logs matching error patterns found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... actual=Error logs found in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`
... title=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` is generating error logs.
... reproduce_hint=View Commands Used in Report Output
... details=Deployment `${DEPLOYMENT_NAME}` in `${NAMESPACE}` generated the following log analysis: \n${logs.stdout}
... next_steps=${recommendations.stdout}
END
${history}= RW.CLI.Pop Shell History
RW.Core.Add Pre To Report
... Recent logs from deployment/`${DEPLOYMENT_NAME}` in `${NAMESPACE}`:\n\n${logs.stdout}
Expand Down Expand Up @@ -80,7 +81,7 @@ Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
... secret_file__kubeconfig=${kubeconfig}
... show_in_rwl_cheatsheet=true
${recommendations}= RW.CLI.Run Cli
... cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< "${liveness_probe_health.stdout}"
... cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< '''${liveness_probe_health.stdout}'''
... env=${env}
... include_in_history=false
IF len($recommendations.stdout) > 0
Expand Down Expand Up @@ -117,7 +118,7 @@ Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
... secret_file__kubeconfig=${kubeconfig}
... show_in_rwl_cheatsheet=true
${recommendations}= RW.CLI.Run Cli
... cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< "${readiness_probe_health.stdout}"
... cmd=awk '/Recommended Next Steps:/ {flag=1; next} flag' <<< '''${readiness_probe_health.stdout}'''
... env=${env}
... include_in_history=false
IF len($recommendations.stdout) > 0
Expand Down
42 changes: 36 additions & 6 deletions codebundles/k8s-deployment-healthcheck/workload_next_steps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
next_steps+=("Get Deployment Workload Details For \`$owner_name\` and Add to Report")
fi

if [[ $messages =~ "Deployment does not have minimum availability" && $owner_kind == "Deployment" ]]; then
next_steps+=("Troubleshoot Deployment Warning Events for \`$owner_name\`")
next_steps+=("Troubleshoot Container Restarts In Namespace \`$NAMESPACE\`")
fi

if [[ $messages =~ "Misconfiguration" ]]; then
next_steps+=("Review configuration of owner_kind \`$owner_name\`")
next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
fi

if [[ $messages =~ "PodInitializing" ]]; then
next_steps+=("Check $owner_kind Health for \`$owner_name\`")
next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
fi

if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
fi
Expand All @@ -44,11 +44,21 @@ if [[ $messages =~ "Readiness probe errored" || $messages =~ "Readiness probe fa
next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
fi

if [[ $messages =~ "PodFailed" ]]; then
next_steps+=("Check Readiness Probe Configuration for $owner_kind \`$owner_name\`")
fi

if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
fi

if [[ $messages =~ "Back-off restarting failed container" ]]; then
next_steps+=("Check Log for $owner_kind \`$owner_name\`")
next_steps+=("Troubleshoot Warning Events for $owner_kind \`$owner_name\`")

fi

if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
Expand All @@ -58,5 +68,25 @@ if [[ $messages =~ "forbidden: failed quota" ]]; then
# Backticks are escaped so they are emitted literally around the namespace
# (as in the sibling next steps) rather than being evaluated by the shell as a
# command substitution that tries to execute the namespace name.
next_steps+=("Check Resource Quota Utilization in Namespace \`${NAMESPACE}\`")
fi

if [[ $messages =~ "No preemption victims found for incoming pod" || $messages =~ "Insufficient cpu" ]]; then
next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner. ")
next_steps+=("Increase Node Count in Cluster")
next_steps+=("Check for Quota Errors")
fi

if [[ $messages =~ "max node group size reached" ]]; then
next_steps+=("Not enough node resources available to schedule pods. Escalate this issue to your cluster owner.")
next_steps+=("Increase node count in cluster.")
next_steps+=("Check for quota errors.")
fi

if [[ $messages =~ "Health check failed after" ]]; then
next_steps+=("Check $owner_kind \`$owner_name\` Health")
fi

if [[ ${#next_steps[@]} -eq 0 ]]; then
next_steps+=("Please review the report logs and escalate the issue if necessary.")
fi

# Display the list of recommendations
printf "%s\n" "${next_steps[@]}" | sort | uniq
33 changes: 22 additions & 11 deletions codebundles/k8s-fluxcd-kustomization-health/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift,FluxCD
Library RW.Core
Library RW.CLI
Library RW.platform
Library RW.NextSteps
Library String

Suite Setup Suite Initialization

Expand All @@ -28,26 +30,35 @@ Get details for unready Kustomizations in Namespace `${NAMESPACE}`
[Documentation] List all Kustomizations that are not found in a ready state in namespace ${NAMESPACE}
[Tags] FluxCD Kustomization Versions ${NAMESPACE}
${kustomizations_not_ready}= RW.CLI.Run Cli
... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${RESOURCE_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json | jq -r '.items[] | select (.status.conditions[] | select(.type == "Ready" and .status == "False")) | "---\\nKustomization Name: \\(.metadata.name)\\n\\nReady Status: \\(.status.conditions[] | select(.type == "Ready") | "\\n ready: \\(.status)\\n message: \\(.message)\\n reason: \\(.reason)\\n last_transition_time: \\(.lastTransitionTime)")\\n\\nReconcile Status:\\(.status.conditions[] | select(.type == "Reconciling") |"\\n reconciling: \\(.status)\\n message: \\(.message)")\\n---\\n"'
... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${RESOURCE_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json | jq '[.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | {KustomizationName: .metadata.name, ReadyStatus: {ready: (.status.conditions[] | select(.type == "Ready").status), message: (.status.conditions[] | select(.type == "Ready").message), reason: (.status.conditions[] | select(.type == "Ready").reason), last_transition_time: (.status.conditions[] | select(.type == "Ready").lastTransitionTime)}, ReconcileStatus: {reconciling: (.status.conditions[] | select(.type == "Reconciling").status), message: (.status.conditions[] | select(.type == "Reconciling").message)}}]'
... env=${env}
... secret_file__kubeconfig=${KUBECONFIG}
... show_in_rwl_cheatsheet=true
... render_in_commandlist=true
RW.CLI.Parse Cli Output By Line
... rsp=${kustomizations_not_ready}
... set_severity_level=2
... set_issue_expected=Kustomizations should be synced and ready.
... set_issue_actual=We found the following kustomization objects in a pending state: $_stdout
... set_issue_title=Unready Kustomizations Found In Namespace ${NAMESPACE}
... set_issue_details=Kustomizations pending with reasons:\n"$_stdout" in the namespace ${NAMESPACE}
... _line__raise_issue_if_contains=-
${kustomizations_not_ready_list}= Evaluate json.loads(r'''${kustomizations_not_ready.stdout}''') json
IF len(@{kustomizations_not_ready_list}) > 0
FOR ${item} IN @{kustomizations_not_ready_list}
${messages}= Replace String ${item["ReadyStatus"]["message"]} " ${EMPTY}
${item_next_steps}= RW.CLI.Run Bash File
... bash_file=workload_next_steps.sh
... cmd_override=./workload_next_steps.sh "${messages}"
... env=${env}
... include_in_history=False
RW.Core.Add Issue
... severity=2
... expected=Kustomizations should be synced and ready.
... actual=We found the following kustomization objects in a pending state: ${item}
... title=Unready Kustomization \`${item["KustomizationName"]}\` Found In Namespace \`${NAMESPACE}\`
... reproduce_hint=${kustomizations_not_ready.cmd}
... details=${item}
... next_steps=${item_next_steps.stdout}
END
END
${history}= RW.CLI.Pop Shell History
IF """${kustomizations_not_ready.stdout}""" == ""
${kustomizations_not_ready}= Set Variable No Kustomizations Pending Found
ELSE
${kustomizations_not_ready}= Set Variable ${kustomizations_not_ready.stdout}
END
${history}= RW.CLI.Pop Shell History
RW.Core.Add Pre To Report Kustomizations with: \n ${kustomizations_not_ready}
RW.Core.Add Pre To Report Commands Used:\n${history}

Expand Down
34 changes: 34 additions & 0 deletions codebundles/k8s-fluxcd-kustomization-health/workload_next_steps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# -----------------------------------------------------------------------------
# Script Information and Metadata
# -----------------------------------------------------------------------------
# Author: @stewartshea
# Description: This script takes in a message string captured from a
# Kubernetes based system and prints some generalized next steps based on the
# content of the message.
# -----------------------------------------------------------------------------
# Input:  $1 - the message text to inspect. The owner kind and owner name are
#         not passed in; they are parsed out of the first "[kind/name/...]"
#         reference found inside the message.
# Output: one next step per line on stdout, sorted and de-duplicated. Nothing
#         is printed when no rule matches.
# -----------------------------------------------------------------------------

#######################################
# Build and print the next steps for a single message.
# Arguments: $1 - message text (treated as empty when omitted)
# Outputs:   next steps on stdout, one per line
# Returns:   0
#######################################
main() {
  local messages="${1:-}"
  local matched owner_kind owner_name _additional_details
  local -a next_steps=()

  # Try to parse out object details: grab the text between "[" and the next
  # "]" when it looks like "word/word/anything". Only the first reference is
  # used so the parsed fields are always single-line values.
  # NOTE(review): \w does not match "-", so references whose first two
  # segments contain hyphens are not parsed and both fields stay empty.
  # NOTE(review): if the message format is kind/namespace/name, the second
  # segment used as owner_name here is the namespace - confirm against real
  # messages.
  matched=$(grep -oP '\[\K(\w+\/\w+\/.+?)(?=\])' <<< "$messages" | head -n 1)

  # Split on "/": 1st segment -> kind, 2nd -> name, the remainder is unused.
  IFS='/' read -r owner_kind owner_name _additional_details <<< "$matched"

  if [[ $messages =~ "Health check failed" ]]; then
    next_steps+=("Troubleshoot $owner_kind Replicas for \`$owner_name\`")
    next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
  fi

  # Display the list of recommendations. The guard matters: printf with an
  # empty array would still emit one blank line, which callers would then
  # treat as a non-empty set of next steps.
  if (( ${#next_steps[@]} > 0 )); then
    printf '%s\n' "${next_steps[@]}" | sort -u
  fi
}

main "$@"
6 changes: 3 additions & 3 deletions codebundles/k8s-namespace-healthcheck/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,9 @@ Troubleshoot Pending Pods In Namespace `${NAMESPACE}`
... secret_file__kubeconfig=${kubeconfig}
... show_in_rwl_cheatsheet=true
... render_in_commandlist=true
${pendind_pod_list}= Split String ${pending_pods.stdout} _______-
IF len($pendind_pod_list) > 0
FOR ${item} IN @{pendind_pod_list}
${pending_pod_list}= Split String ${pending_pods.stdout} _______-
IF len($pending_pod_list) > 0
FOR ${item} IN @{pending_pod_list}
${is_not_just_newline}= Evaluate '''${item}'''.strip() != ''
IF ${is_not_just_newline}
${pod_name}= RW.CLI.Run Cli
Expand Down
8 changes: 6 additions & 2 deletions codebundles/k8s-namespace-healthcheck/workload_next_steps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,15 @@ if [[ $messages =~ "Misconfiguration" && $owner_kind == "Deployment" ]]; then
fi

if [[ $messages =~ "Misconfiguration" ]]; then
next_steps+=("Review configuration of owner_kind \`$owner_name\`")
next_steps+=("Review configuration of $owner_kind \`$owner_name\`")
next_steps+=("Check for Node Failures or Maintenance Activities in Cluster \`$CONTEXT\`")
fi

if [[ $messages =~ "PodInitializing" ]]; then
next_steps+=("Check $owner_kind Health for \`$owner_name\`")
next_steps+=("Troubleshoot $owner_kind Warning Events for \`$owner_name\`")
fi

if [[ $messages =~ "Liveness probe failed" || $messages =~ "Liveness probe errored" ]]; then
next_steps+=("Check Liveliness Probe Configuration for $owner_kind \`$owner_name\`")
fi
Expand All @@ -54,7 +59,6 @@ if [[ $messages =~ "Back-off restarting failed container" ]]; then

fi


if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" || $messages =~ "ErrImagePull" ]]; then
next_steps+=("List ImagePullBackoff Events and Test Path and Tags for Namespace \`$NAMESPACE\`")
next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
Expand Down

0 comments on commit fcfc064

Please sign in to comment.