Skip to content

Commit

Permalink
Updates/rs and owner (#340)
Browse files Browse the repository at this point in the history
* replica health script

* integrate task

* fix runwhen flag

* add util to find related resources for certain label / annotation patterns

* update default error codes

* switch to a script for future improvements

* add additional related objects

* simplify issue titles
  • Loading branch information
stewartshea authored Feb 26, 2024
1 parent 90db058 commit dd02724
Show file tree
Hide file tree
Showing 7 changed files with 309 additions and 24 deletions.
24 changes: 9 additions & 15 deletions codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Metadata Supports GCP,GMP,Ingress,Nginx,Metrics
Library BuiltIn
Library RW.Core
Library RW.CLI
Library RW.K8sHelper
Library RW.platform
Library OperatingSystem

Expand Down Expand Up @@ -42,14 +43,20 @@ Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}`
${owner_name}= RW.CLI.Run Cli
... cmd=echo "${k8s_ingress_details.stdout}" | grep 'Owner:[^ ]*' | awk -F': ' '{print $2}' |awk -F':' '{print $2}'| sed 's/ *$//' | tr -d '\n'
... include_in_history=false
${k8s_ingress_details}= RW.CLI.Run Cli
... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ingress ${INGRESS_OBJECT_NAME} -n ${NAMESPACE} --context ${CONTEXT} -o json
... env=${env}
... secret_file__kubeconfig=${kubeconfig}
${related_resource_recommendations}= RW.K8sHelper.Get Related Resource Recommendations
... k8s_object=${k8s_ingress_details.stdout}
RW.CLI.Parse Cli Output By Line
... rsp=${gmp_rsp}
... set_severity_level=2
... set_issue_expected=The ingress in $_line should not have any HTTP responses with the following codes: ${ERROR_CODES}
... set_issue_actual=We found the following HTTP error codes: ${ERROR_CODES} associated with the ingress in $_line
... set_issue_title=Detected HTTP Error Codes for Ingress `${INGRESS_OBJECT_NAME}`
... set_issue_details=HTTP error codes in ingress and service "$_line". Troubleshoot the application associated with ${owner_kind.stdout} `${owner_name.stdout}`
... set_issue_next_steps=Check Deployment Log For Issues with `${owner_name.stdout}`\nQuery Traces for HTTP Errors in Namespace `${NAMESPACE}`
... set_issue_next_steps=Check Deployment Log For Issues with `${owner_name.stdout}`\nQuery Traces for HTTP Errors in Namespace `${NAMESPACE}`\n${related_resource_recommendations}
... _line__raise_issue_if_contains=Host
${ingress_info}= Set Variable ${gmp_rsp.stdout}
IF """${ingress_info}""" == "" or """${ingress_info}""".isspace()
Expand All @@ -75,7 +82,6 @@ Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}`
RW.Core.Add Pre To Report Commands Used: ${history}
RW.Core.Add Pre To Report Ingress Info:\n${k8s_ingress_details.stdout}


*** Keywords ***
Suite Initialization
${kubeconfig}= RW.Core.Import Secret
Expand All @@ -84,10 +90,6 @@ Suite Initialization
... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
... pattern=\w*
... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
${kubectl}= RW.Core.Import Service kubectl
... description=The location service used to interpret shell commands.
... default=kubectl-service.shared
... example=kubectl-service.shared
${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY
... type=string
... description=Which binary to use for Kubernetes CLI commands.
Expand All @@ -105,12 +107,6 @@ Suite Initialization
... pattern=\w*
... example=otel-demo
... default=
${GCLOUD_SERVICE}= RW.Core.Import Service gcloud
... type=string
... description=The selected RunWhen Service to use for accessing services within a network.
... pattern=\w*
... example=gcloud-service.shared
... default=gcloud-service.shared
${gcp_credentials_json}= RW.Core.Import Secret gcp_credentials_json
... type=string
... description=GCP service account json used to authenticate with GCP APIs.
Expand Down Expand Up @@ -151,15 +147,13 @@ Suite Initialization
... description=Which http status codes to look for and classify as errors.
... pattern=\w*
... example=500
... default=500|501|502
... default=500|501|502|503|504
${OS_PATH}= Get Environment Variable PATH
Set Suite Variable ${kubeconfig} ${kubeconfig}
Set Suite Variable ${kubectl} ${kubectl}
Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY}
Set Suite Variable ${CONTEXT} ${CONTEXT}
Set Suite Variable ${NAMESPACE} ${NAMESPACE}
Set Suite Variable ${ERROR_CODES} ${ERROR_CODES}
Set Suite Variable ${GCLOUD_SERVICE} ${GCLOUD_SERVICE}
Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json}
Set Suite Variable ${GCP_PROJECT_ID} ${GCP_PROJECT_ID}
Set Suite Variable ${INGRESS_HOST} ${INGRESS_HOST}
Expand Down
146 changes: 146 additions & 0 deletions codebundles/k8s-deployment-healthcheck/check_replicaset.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/bin/bash

# Kubernetes Deployment ReplicaSet Health Check Script
# This script checks that a Kubernetes Deployment is running on its latest ReplicaSet. It only reads cluster state
# (it never modifies or deletes anything): it detects multiple ReplicaSets, tells a rolling update in progress apart
# from stale or conflicting ReplicaSets, and prints actionable next steps for any inactive or conflicting ReplicaSets.
# It reads KUBERNETES_DISTRIBUTION_BINARY, DEPLOYMENT_NAME, NAMESPACE and CONTEXT from the environment.

# Function to check for rolling update status
#
# Decides whether the Deployment described by $DEPLOYMENT_JSON is in the middle
# of (or has only just finished) a rollout.
#
# Globals:
#   DEPLOYMENT_JSON       (read)    Deployment object as JSON text
#   DEPLOYMENT_NAME       (read)    used in the status message only
#   ROLLING_UPDATE_STATUS (written) 0 = rollout in progress or finished less
#                                   than two minutes ago, 1 = stable
# Outputs:
#   One human-readable status line on stdout.
check_rolling_update_status() {
    # Declarations are kept separate from the assignments so that a failing
    # command substitution is not hidden behind the exit status of 'local'.
    local progressingCondition progressingStatus progressingReason lastUpdateTime
    local replicas updatedReplicas availableReplicas readyReplicas
    local lastUpdateEpoch stabilizationCutoff

    # Extract conditions and replica counts. '[]?' keeps jq quiet when the
    # Deployment has no conditions array.
    progressingCondition=$(jq '.status.conditions[]? | select(.type=="Progressing")' <<<"$DEPLOYMENT_JSON")
    replicas=$(jq '.status.replicas // 0' <<<"$DEPLOYMENT_JSON")
    updatedReplicas=$(jq '.status.updatedReplicas // 0' <<<"$DEPLOYMENT_JSON")
    availableReplicas=$(jq '.status.availableReplicas // 0' <<<"$DEPLOYMENT_JSON")
    readyReplicas=$(jq '.status.readyReplicas // 0' <<<"$DEPLOYMENT_JSON")

    # Fields of the 'Progressing' condition; empty when the condition or the
    # field is absent.
    progressingStatus=$(jq -r '.status // empty' <<<"$progressingCondition")
    progressingReason=$(jq -r '.reason // empty' <<<"$progressingCondition")
    lastUpdateTime=$(jq -r '.lastUpdateTime // empty' <<<"$progressingCondition")

    # Compare replica counts for a more accurate ongoing rollout check
    if [[ "$progressingStatus" == "True" && "$progressingReason" == "NewReplicaSetAvailable" \
        && "$updatedReplicas" == "$replicas" && "$availableReplicas" == "$updatedReplicas" \
        && "$readyReplicas" == "$updatedReplicas" ]]; then
        # Allow a two minute stabilization buffer after the last update.
        # A missing or unparseable timestamp is treated as "long ago" (epoch 0),
        # i.e. the deployment is considered stable. Requires GNU date for -d.
        lastUpdateEpoch=0
        if [[ -n "$lastUpdateTime" ]]; then
            lastUpdateEpoch=$(date -d "$lastUpdateTime" +%s 2>/dev/null) || lastUpdateEpoch=0
        fi
        stabilizationCutoff=$(date -d '-2 minutes' +%s)

        if (( lastUpdateEpoch < stabilizationCutoff )); then
            echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected."
            ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress
        else
            echo "Deployment $DEPLOYMENT_NAME has recently updated and may still be stabilizing."
            ROLLING_UPDATE_STATUS=0 # Indicates recent update, considering stabilization
        fi
    elif [[ "$updatedReplicas" -lt "$replicas" ]] || [[ "$availableReplicas" -lt "$updatedReplicas" ]] || [[ "$readyReplicas" -lt "$updatedReplicas" ]]; then
        echo "Deployment $DEPLOYMENT_NAME is undergoing a rollout."
        ROLLING_UPDATE_STATUS=0 # Indicates an update is in progress
    else
        echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected."
        ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress
    fi
}



verify_pods_association_with_latest_rs() {
    # Counts the Deployment's pods that are NOT owned by the latest ReplicaSet
    # and, when there are any, records a severity-2 issue.
    #
    # Globals:
    #   KUBERNETES_DISTRIBUTION_BINARY, NAMESPACE, CONTEXT, DEPLOYMENT_NAME (read)
    #   LATEST_RS        (read)    name of the ReplicaSet pods should belong to
    #   RS               (read)    stale ReplicaSet named in the next steps
    #   RS_DETAILS_JSON  (read)    JSON text attached as the issue details
    #   PODS_JSON, PODS_COUNT, OUTDATED_PODS_COUNT (written)
    #   issue_details    (written) only when outdated pods are found
    # Outputs:
    #   Summary line on stdout; fetch/parse errors on stderr.
    # Returns:
    #   0 normally, 1 when the pod list could not be fetched or parsed.

    # Fetch all pods associated with the deployment.
    # NOTE(review): the selector assumes pods are labelled app=<deployment name>;
    # deployments using a different label scheme will match no pods - confirm.
    # The binary variable is left unquoted on purpose, as elsewhere in this script.
    if ! PODS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get pods -n "$NAMESPACE" --context "$CONTEXT" --selector="app=$DEPLOYMENT_NAME" -o json); then
        echo "Error: unable to list pods for Deployment $DEPLOYMENT_NAME in namespace $NAMESPACE." >&2
        return 1
    fi

    PODS_COUNT=$(jq '.items | length' <<<"$PODS_JSON" 2>/dev/null)
    if [[ -z "$PODS_COUNT" ]]; then
        echo "Error: pod list for Deployment $DEPLOYMENT_NAME is not valid JSON." >&2
        return 1
    fi

    # A pod is outdated when none of its ReplicaSet owners is the latest
    # ReplicaSet (pods without any ReplicaSet owner count as outdated too).
    # Done in a single jq pass instead of one jq process per pod.
    OUTDATED_PODS_COUNT=$(jq --arg latest "$LATEST_RS" '
        [ .items[]
          | select(
              [ .metadata.ownerReferences[]? | select(.kind == "ReplicaSet") | .name ]
              | any(. == $latest)
              | not
            )
        ] | length' <<<"$PODS_JSON")

    if [[ "$OUTDATED_PODS_COUNT" -eq 0 ]]; then
        echo "All pods are correctly associated with the latest ReplicaSet."
    else
        echo "Warning: $OUTDATED_PODS_COUNT pod(s) are not associated with the latest ReplicaSet."
        # Build the issue with jq so every value is escaped correctly and the
        # result is always valid JSON, whatever the details contain.
        issue_details=$(jq -cn \
            --arg severity "2" \
            --arg title "$OUTDATED_PODS_COUNT pod(s) are not running the latest version of Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
            --arg next_steps "Clean up stale ReplicaSet \`$RS\` for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\` " \
            --arg details "$RS_DETAILS_JSON" \
            '{severity: $severity, title: $title, next_steps: $next_steps, details: $details}')
    fi
}

# Main flow: fetch the Deployment and its ReplicaSets, work out which
# ReplicaSet is current, and raise an issue for every other ReplicaSet that
# still has replicas.
# Requires in the environment: KUBERNETES_DISTRIBUTION_BINARY, DEPLOYMENT_NAME,
# NAMESPACE, CONTEXT.

# Emit one issue as compact JSON. Using jq guarantees that every value is
# escaped properly, so multi-line JSON can be carried in "details".
# Arguments: $1 severity, $2 title, $3 next steps, $4 details
build_issue_json() {
    jq -cn \
        --arg severity "$1" \
        --arg title "$2" \
        --arg next_steps "$3" \
        --arg details "$4" \
        '{severity: $severity, title: $title, next_steps: $next_steps, details: $details}'
}

# Get Deployment JSON. Stop early on failure instead of reporting a healthy
# deployment based on empty data.
if ! DEPLOYMENT_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json); then
    echo "Error: unable to fetch Deployment $DEPLOYMENT_NAME in namespace $NAMESPACE." >&2
    exit 1
fi

# Get every ReplicaSet in the namespace, then keep those owned by the Deployment
if ! NAMESPACE_RS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get rs -n "$NAMESPACE" --context "$CONTEXT" -o json); then
    echo "Error: unable to list ReplicaSets in namespace $NAMESPACE." >&2
    exit 1
fi
REPLICASETS_JSON=$(jq --arg DEPLOYMENT_NAME "$DEPLOYMENT_NAME" \
    '[.items[] | select(.metadata.ownerReferences[]? | select(.kind == "Deployment" and .name == $DEPLOYMENT_NAME))]' \
    <<<"$NAMESPACE_RS_JSON")

# Extract the name of the latest ReplicaSet from the filtered JSON.
# The Deployment controller stamps each ReplicaSet with a
# deployment.kubernetes.io/revision annotation and bumps it on rollback, so the
# highest revision is the current ReplicaSet even when an older-created one has
# been re-activated. creationTimestamp is only the tie-breaker / fallback.
LATEST_RS=$(jq -r '
    ( sort_by([
        ((.metadata.annotations["deployment.kubernetes.io/revision"] // "0") | (tonumber? // 0)),
        .metadata.creationTimestamp
      ])
      | last
      | .metadata.name
    ) // empty' <<<"$REPLICASETS_JSON")

# Extract names of all ReplicaSets associated with the Deployment from the filtered JSON
readarray -t ALL_RS_NAMES < <(jq -r '.[].metadata.name' <<<"$REPLICASETS_JSON")
ALL_RS="${ALL_RS_NAMES[*]}"

echo "Latest ReplicaSet: $LATEST_RS"
echo "All ReplicaSets for the deployment: $ALL_RS"

ROLLING_UPDATE_STATUS=-1 # Default to -1; will be set to 0 or 1 by check_rolling_update_status
check_rolling_update_status

issues=""
issue_list=()

# Check if there are multiple ReplicaSets and if the latest is active
if (( ${#ALL_RS_NAMES[@]} > 1 )); then
    echo "Multiple ReplicaSets detected. Verifying..."

    # Loop through all ReplicaSets
    for RS in "${ALL_RS_NAMES[@]}"; do
        # Skip the latest ReplicaSet
        if [[ "$RS" == "$LATEST_RS" ]]; then
            continue
        fi

        # Start every iteration clean so an issue raised for a previous
        # ReplicaSet is not appended a second time.
        issue_details=""

        # Check the status of older ReplicaSets
        RS_DETAILS_JSON=$(jq --arg RS "$RS" '.[] | select(.metadata.name==$RS)' <<<"$REPLICASETS_JSON")
        REPLICAS=$(jq '.status.replicas // 0' <<<"$RS_DETAILS_JSON")

        if [[ "$REPLICAS" == "0" ]]; then
            echo "ReplicaSet $RS for Deployment $DEPLOYMENT_NAME is not active. Consider for cleanup..."
        elif [[ $ROLLING_UPDATE_STATUS -eq 0 ]]; then
            echo "Multiple ReplicaSets are active, which is expected due to the rolling update process."
            issue_details=$(build_issue_json "4" \
                "A rolling update is in progress for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
                "Wait for Rollout to Complete and Check Again." \
                "$RS_DETAILS_JSON")
        elif [[ $ROLLING_UPDATE_STATUS -eq 1 ]]; then
            echo "Multiple ReplicaSets are active and no update appears to be in place. Investigation may be required to ensure they are not conflicting."
            # Called for its diagnostic output; the issue recorded for this
            # ReplicaSet is the "conflicting versions" one built below.
            verify_pods_association_with_latest_rs
            issue_details=$(build_issue_json "2" \
                "Conflicting versions detected for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
                "Clean up stale ReplicaSet \`$RS\` for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\` " \
                "$RS_DETAILS_JSON")
        else
            echo "Multiple ReplicaSets are active and no update appears to be in place. Investigation may be required to ensure they are not conflicting."
        fi

        if [[ -n "$issue_details" ]]; then
            issue_list+=("$issue_details")
        fi
    done

    # Join the collected issue objects into one JSON array ("[]" when none).
    issues=$(printf '%s\n' "${issue_list[@]}" | jq -cs '.')
else
    echo "Only one ReplicaSet is active. Deployment is up to date."
fi


# Display all unique recommendations that can be shown as Next Steps
if [[ -n "$issues" ]]; then
    echo -e "\nRecommended Next Steps: \n"
    echo "$issues"
fi
51 changes: 51 additions & 0 deletions codebundles/k8s-deployment-healthcheck/event_anomalies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Summarises non-Warning Kubernetes events that relate to a Deployment, grouped
# per involved object, and prints the summary as a JSON array on stdout.
#
# Assuming environment variables are already exported and available:
#   KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE, DEPLOYMENT_NAME

# Command to get Kubernetes events in JSON format. Fail loudly when the fetch
# does not work, rather than printing an empty result that looks like "no events".
if ! EVENTS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get events --context "${CONTEXT}" -n "${NAMESPACE}" -o json); then
    echo "Error: unable to fetch events in namespace ${NAMESPACE}." >&2
    exit 1
fi

# Use jq to process the JSON, skipping events without valid timestamps.
# Stage 1 keeps non-Warning events for Deployment/ReplicaSet/Pod objects whose
# name contains the deployment name and computes each event's duration in
# minutes (1 is used when first and last timestamp are identical).
# Stage 2 groups by [kind, name] and aggregates counts, reasons, messages and
# the earliest/latest timestamps.
# NOTE(review): 'contains' is a substring match, so objects of another
# deployment whose name includes this one are picked up as well - confirm.
# NOTE(review): average_events_per_minute divides by the duration of the first
# event of the group only, and a real duration of exactly one minute takes the
# same branch as the "identical timestamps" placeholder - confirm intent.
if ! PROCESSED_EVENTS=$(jq --arg DEPLOYMENT_NAME "${DEPLOYMENT_NAME}" '
[ .items[]
  | select(
      .type != "Warning"
      and (.involvedObject.kind | test("Deployment|ReplicaSet|Pod"))
      and (.involvedObject.name | contains($DEPLOYMENT_NAME))
      and (.firstTimestamp | fromdateiso8601? // empty) and (.lastTimestamp | fromdateiso8601? // empty)
    )
  | {
      kind: .involvedObject.kind,
      count: .count,
      name: .involvedObject.name,
      reason: .reason,
      message: .message,
      firstTimestamp: .firstTimestamp,
      lastTimestamp: .lastTimestamp,
      duration: (
        if (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601)) == 0)
        then 1
        else (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601)) / 60)
        end
      )
    }
]
| group_by([.kind, .name])
| map({
    kind: .[0].kind,
    name: .[0].name,
    count: (map(.count) | add),
    reasons: (map(.reason) | unique),
    messages: (map(.message) | unique),
    average_events_per_minute: (
      if .[0].duration == 1
      then 1
      else ((map(.count) | add) / .[0].duration)
      end
    ),
    firstTimestamp: (map(.firstTimestamp | fromdateiso8601) | sort | .[0] | todateiso8601),
    lastTimestamp: (map(.lastTimestamp | fromdateiso8601) | sort | reverse | .[0] | todateiso8601)
  })
' <<<"${EVENTS_JSON}"); then
    echo "Error: unable to process events for Deployment ${DEPLOYMENT_NAME}." >&2
    exit 1
fi

echo "${PROCESSED_EVENTS}"
Loading

0 comments on commit dd02724

Please sign in to comment.