diff --git a/codebundles/gcp-bucket-health/check_security.sh b/codebundles/gcp-bucket-health/check_security.sh index aad8660d..59c81f12 100755 --- a/codebundles/gcp-bucket-health/check_security.sh +++ b/codebundles/gcp-bucket-health/check_security.sh @@ -109,6 +109,8 @@ done echo "Security Issues:" if [ ${#ISSUES[@]} -eq 0 ]; then echo "No security issues found." + # Add empty json list to file so that json loads doesn't fail. + echo "[{}]" > $HOME/bucket_security_issues.json else echo "${ISSUES[@]}" | jq -s . > $HOME/bucket_security_issues.json cat $HOME/bucket_security_issues.json diff --git a/codebundles/gcp-bucket-health/runbook.robot b/codebundles/gcp-bucket-health/runbook.robot index c3d05160..09a16c8c 100644 --- a/codebundles/gcp-bucket-health/runbook.robot +++ b/codebundles/gcp-bucket-health/runbook.robot @@ -61,6 +61,9 @@ Check GCP Bucket Security Configuration for `${PROJECT_IDS}` ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ... show_in_rwl_cheatsheet=true + RW.Core.Add Pre To Report GCP Security Configuration Check:\n${bucket_security_configuration.stdout} + RW.Core.Add Pre To Report Commands Used:\n${bucket_security_configuration.cmd} + ${bucket_security_output}= RW.CLI.Run Cli ... cmd=cat $HOME/bucket_security_issues.json | jq . ... env=${env} @@ -82,8 +85,6 @@ Check GCP Bucket Security Configuration for `${PROJECT_IDS}` ... 
#######################################
# Pass stdin through the optional include/exclude log filters.
# Globals:
#   LOGS_ERROR_PATTERN   (read) extended regex; when set, only matching lines
#                        are kept (case-insensitive).
#   LOGS_EXCLUDE_PATTERN (read) extended regex; when set, matching lines are
#                        dropped (case-insensitive).
# Outputs:
#   Filtered lines on stdout; input is copied unchanged when neither is set.
#######################################
_filter_log_lines() {
  # `-e` keeps a pattern that starts with '-' from being parsed as an option.
  if [[ -n "${LOGS_ERROR_PATTERN:-}" && -n "${LOGS_EXCLUDE_PATTERN:-}" ]]; then
    grep -Ei -e "$LOGS_ERROR_PATTERN" | grep -Eiv -e "$LOGS_EXCLUDE_PATTERN"
  elif [[ -n "${LOGS_ERROR_PATTERN:-}" ]]; then
    grep -Ei -e "$LOGS_ERROR_PATTERN"
  elif [[ -n "${LOGS_EXCLUDE_PATTERN:-}" ]]; then
    grep -Eiv -e "$LOGS_EXCLUDE_PATTERN"
  else
    cat
  fi
}

#######################################
# Fetch the last 3h of logs (capped at 256000 bytes) for one container, apply
# the optional filters and save the result to a file in the current directory.
# Globals:
#   KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE (read)
#   LOGS_ERROR_PATTERN, LOGS_EXCLUDE_PATTERN           (read, optional)
#   LOG_FILES (appended) - array of every file name written so far
#   LOGS, FIRST_LINE, EXT, FILENAME (written) - kept as globals because the
#     original function set them and the rest of the script is not visible here
# Arguments:
#   $1 - pod name
#   $2 - container name
#   $3 - extra flag for the logs command; "" for current logs, "-p" for the
#        previous container instance
# Outputs:
#   A progress line on stdout; the (possibly empty) log text in
#   <pod>_<container>_logs[_previous].<json|txt>
#######################################
fetch_logs() {
  local POD=$1
  local CONTAINER=$2
  local PREVIOUS_FLAG=${3:-}
  local -a logs_cmd

  # Build the command once as an array so an empty flag simply disappears and
  # no argument is subject to word splitting or globbing.
  logs_cmd=("$KUBERNETES_DISTRIBUTION_BINARY" logs "$POD" -c "$CONTAINER")
  if [[ -n "$PREVIOUS_FLAG" ]]; then
    logs_cmd+=("$PREVIOUS_FLAG")
  fi
  logs_cmd+=(--limit-bytes=256000 --since=3h "--context=$CONTEXT" -n "$NAMESPACE")

  LOGS=$("${logs_cmd[@]}" | _filter_log_lines)

  # Check log format and store appropriately: the file is tagged "json" only
  # when its first line parses as a truthy JSON value.
  FIRST_LINE=${LOGS%%$'\n'*}
  if jq -e . <<<"$FIRST_LINE" &>/dev/null; then
    EXT="json"
  else
    EXT="txt"
  fi

  FILENAME="${POD}_${CONTAINER}_logs${PREVIOUS_FLAG:+_previous}.$EXT"
  LOG_FILES+=("$FILENAME")
  echo "Fetching logs for Pod: $POD, Container: $CONTAINER. Saving to $FILENAME."
  printf '%s\n' "$LOGS" > "$FILENAME"
}
&>/dev/null && echo "json" || echo "txt") - FILENAME="${POD}_${CONTAINER}_logs.$EXT" - LOG_FILES+=("$FILENAME") - echo "Fetching logs for Pod: $POD, Container: $CONTAINER. Saving to $FILENAME." - echo "$LOGS" > $FILENAME + fetch_logs $POD $CONTAINER "" + fetch_logs $POD $CONTAINER "-p" done -done < <(${KUBERNETES_DISTRIBUTION_BINARY} get pods --selector=$SELECTOR -n ${NAMESPACE} --context=${CONTEXT} -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') +done < <($KUBERNETES_DISTRIBUTION_BINARY get pods --selector=$SELECTOR -n $NAMESPACE --context=$CONTEXT -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') + # Initialize an issue description array issue_descriptions=() diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index 5758e52a..cce01a75 100644 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -61,7 +61,6 @@ Check Deployment Log For Issues with `${DEPLOYMENT_NAME}` ... 
#!/bin/bash
# set -eo pipefail

# -----------------------------------------------------------------------------
# Script Information and Metadata
# -----------------------------------------------------------------------------
# Author: @stewartshea
# Description: This script is designed to take in some information about a
# resource (typically a pod) and return its owner.
# Usage: find_resource_owners.sh RESOURCE_KIND RESOURCE_NAME [NAMESPACE] [CONTEXT]
# Output: "<OwnerKind> <owner-name>" on stdout without a trailing newline, or
#         the line "No resource found" when nothing matches.
# NOTES:
#  Not sure if this is best served as a bash script or keyword
#  This is quickly added and likely requires further expansion
#  Not sure if it makes sense to keep this as a shared script, or
#  packaged multiple times with each codebundle depending on cases
# -----------------------------------------------------------------------------

# Kubernetes CLI to use. Honours the value exported by the runbook (for example
# `oc`) and falls back to kubectl when nothing is set.
KUBERNETES_DISTRIBUTION_BINARY="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"

#######################################
# Print the owner of a resource as "<Kind> <name>".
# Globals:   KUBERNETES_DISTRIBUTION_BINARY, NAMESPACE, CONTEXT (read)
# Arguments: $1 - resource name
#            $2 - resource kind
# Outputs:   - the resource itself when it has no ownerReferences
#            - for a ReplicaSet owner, the ReplicaSet's own owner (normally a
#              Deployment); the ReplicaSet itself when it has no owner
#            - otherwise the first owner reference
#######################################
get_owner() {
  local resource_name=$1
  local resource_kind=$2
  local owner_ref owner_kind owner_name parent_ref parent_kind parent_name
  # Kind and name are fetched in one call, separated by a single space.
  local -r ref_path='{.metadata.ownerReferences[0].kind}{" "}{.metadata.ownerReferences[0].name}'

  owner_ref=$("${KUBERNETES_DISTRIBUTION_BINARY}" get "$resource_kind" "$resource_name" \
    -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="$ref_path")
  read -r owner_kind owner_name <<<"$owner_ref"

  if [[ -z "$owner_kind" ]]; then
    # No owner reference means there is no parent object. Return the direct object.
    echo "$resource_kind $resource_name"
  elif [[ "$owner_kind" == "ReplicaSet" ]]; then
    parent_ref=$("${KUBERNETES_DISTRIBUTION_BINARY}" get replicaset "$owner_name" \
      -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="$ref_path")
    read -r parent_kind parent_name <<<"$parent_ref"
    if [[ -n "$parent_name" ]]; then
      echo "$parent_kind $parent_name"
    else
      # Standalone ReplicaSet: never report an owner with an empty name.
      echo "ReplicaSet $owner_name"
    fi
  else
    echo "$owner_kind $owner_name"
  fi
}

#######################################
# Look up the first resource whose NAME matches and print its owner.
# Globals:   RESOURCE_KIND, RESOURCE_NAME, NAMESPACE, CONTEXT (written)
# Arguments: $1 kind, $2 name or part of it (regular expression),
#            $3 namespace, $4 context
# Returns:   0 normally; 2 when kind or name is missing
#######################################
main() {
  if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
    printf 'Usage: %s RESOURCE_KIND RESOURCE_NAME [NAMESPACE] [CONTEXT]\n' "${0##*/}" >&2
    return 2
  fi

  # Define the kind of resource, name (or part of it), namespace, and context
  RESOURCE_KIND=$1
  RESOURCE_NAME=$2
  NAMESPACE=${3:-}
  CONTEXT=${4:-}

  local resources resource_name owner

  # Extract the NAME column first so the pattern can only match resource names,
  # never the header row or the READY/STATUS columns.
  resources=$("${KUBERNETES_DISTRIBUTION_BINARY}" get "$RESOURCE_KIND" --no-headers \
      -n "${NAMESPACE}" --context="${CONTEXT}" \
    | awk '{print $1}' \
    | grep -e "${RESOURCE_NAME}")

  if [[ -z "$resources" ]]; then
    echo "No resource found"
    return 0
  fi

  while IFS= read -r resource_name; do
    [[ -n "$resource_name" ]] || continue
    owner=$(get_owner "$resource_name" "$RESOURCE_KIND")
    if [[ -n "$owner" ]]; then
      # Callers split this on the first space; emit it without any newline.
      printf '%s' "${owner//$'\n'/}"
      return 0
    fi
  done <<<"$resources"
}

main "$@"
#!/bin/bash

# -----------------------------------------------------------------------------
# Finds pods in NAMESPACE whose CPU or memory usage (from `top pod`) exceeds
# UTILIZATION_THRESHOLD percent of their limits, or that had a container
# terminated with reason OOMKilled / exit code 137.
# Required environment: KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE,
#                       UTILIZATION_THRESHOLD, DEFAULT_INCREASE
# Output: progress text on stdout and a JSON array written to
#         $HOME/overutilized_pods.json (also printed last on stdout).
# -----------------------------------------------------------------------------

#######################################
# Convert a Kubernetes memory quantity to whole mebibytes.
# Arguments: $1 - e.g. 512Mi, 2Gi, 131072Ki, 500M, 1G, or plain bytes
# Outputs:   rounded integer Mi; 0 for an empty or unrecognised value
#######################################
memory_to_mi() {
  local quantity=$1 number bytes_per_unit
  case "$quantity" in
    *Ki) number=${quantity%Ki}; bytes_per_unit=1024 ;;
    *Mi) number=${quantity%Mi}; bytes_per_unit=1048576 ;;
    *Gi) number=${quantity%Gi}; bytes_per_unit=1073741824 ;;
    *Ti) number=${quantity%Ti}; bytes_per_unit=1099511627776 ;;
    *Pi) number=${quantity%Pi}; bytes_per_unit=1125899906842624 ;;
    *Ei) number=${quantity%Ei}; bytes_per_unit=1152921504606846976 ;;
    *k) number=${quantity%k}; bytes_per_unit=1e3 ;;
    *M) number=${quantity%M}; bytes_per_unit=1e6 ;;
    *G) number=${quantity%G}; bytes_per_unit=1e9 ;;
    *T) number=${quantity%T}; bytes_per_unit=1e12 ;;
    *P) number=${quantity%P}; bytes_per_unit=1e15 ;;
    *E) number=${quantity%E}; bytes_per_unit=1e18 ;;
    *) number=$quantity; bytes_per_unit=1 ;;
  esac
  if [[ ! "$number" =~ ^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$ ]]; then
    echo 0
    return
  fi
  LC_ALL=C awk -v n="$number" -v f="$bytes_per_unit" 'BEGIN { printf "%.0f\n", n * f / 1048576 }'
}

#######################################
# Convert a Kubernetes CPU quantity to whole millicores.
# Arguments: $1 - e.g. 250m, 2, 0.5
# Outputs:   rounded integer millicores; 0 for an empty or unrecognised value
#######################################
cpu_to_millicores() {
  local quantity=$1 number factor=1000
  if [[ "$quantity" == *m ]]; then
    number=${quantity%m}
    factor=1
  else
    number=$quantity
  fi
  if [[ ! "$number" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo 0
    return
  fi
  LC_ALL=C awk -v n="$number" -v f="$factor" 'BEGIN { printf "%.0f\n", n * f }'
}

#######################################
# Sum per-container limits into one pod-level limit.
# `top pod` reports usage for the whole pod, so it must be compared with the
# sum of the container limits. If any container has no limit the pod as a
# whole is unbounded and 0 ("no limit") is reported.
# Arguments: $1 - converter function (memory_to_mi or cpu_to_millicores)
# Inputs:    stdin, one raw limit per container (blank when unset)
# Outputs:   the summed limit, or 0
#######################################
sum_container_limits() {
  local convert=$1 raw value total=0
  while IFS= read -r raw || [[ -n "$raw" ]]; do
    value=$("$convert" "$raw")
    if (( value <= 0 )); then
      echo 0
      return
    fi
    total=$(( total + value ))
  done
  echo "$total"
}

# percent_of VALUE PERCENT -> VALUE * PERCENT / 100, rounded.
percent_of() {
  LC_ALL=C awk -v v="$1" -v p="$2" 'BEGIN { printf "%.0f\n", v * p / 100 }'
}

# increase_by_percent VALUE PERCENT -> VALUE * (1 + PERCENT / 100), rounded.
increase_by_percent() {
  LC_ALL=C awk -v v="$1" -v p="$2" 'BEGIN { printf "%.0f\n", v * (1 + p / 100) }'
}

#######################################
# Function to check if a pod has OOMKilled status or exit code 137.
# Both the current and the previous (lastState) termination of every container
# are inspected, so a container that was killed and has since restarted is
# still detected.
# Globals:   KUBERNETES_DISTRIBUTION_BINARY, NAMESPACE, CONTEXT (read)
# Arguments: $1 - pod name
# Outputs:   "true" or "false"
#######################################
check_pod_status() {
  local pod_name=$1
  local terminations token
  local -a tokens=()
  local -r path='{range .status.containerStatuses[*]}{.state.terminated.reason}{" "}{.state.terminated.exitCode}{" "}{.lastState.terminated.reason}{" "}{.lastState.terminated.exitCode}{"\n"}{end}'

  terminations=$("${KUBERNETES_DISTRIBUTION_BINARY}" get pod "$pod_name" \
    -n "${NAMESPACE}" --context "${CONTEXT}" -o jsonpath="$path")

  # One token per reason / exit code; comparing tokens individually keeps a
  # multi-container pod from producing an invalid arithmetic expression.
  read -r -a tokens <<<"${terminations//$'\n'/ }"
  for token in "${tokens[@]}"; do
    if [[ "$token" == "OOMKilled" || "$token" == "137" ]]; then
      echo true
      return
    fi
  done
  echo false
}

#######################################
# Emit one JSON record for the pod currently being processed.
# Reads the per-pod variables of the calling function (pod, cpu_usage,
# mem_usage, cpu_limit, mem_limit, cpu_threshold, mem_threshold,
# recommended_cpu_increase, recommended_mem_increase) plus NAMESPACE.
# Every record carries the same keys, so consumers can index any of them.
# Arguments: $1 - reason text
#######################################
overutilization_record() {
  jq -cn \
    --arg namespace "${NAMESPACE}" \
    --arg pod "$pod" \
    --arg reason "$1" \
    --arg cpu_usage "$cpu_usage" \
    --arg mem_usage "$mem_usage" \
    --arg cpu_limit "$cpu_limit" \
    --arg mem_limit "$mem_limit" \
    --arg cpu_threshold "$cpu_threshold" \
    --arg mem_threshold "$mem_threshold" \
    --arg rec_cpu "$recommended_cpu_increase (m)" \
    --arg rec_mem "$recommended_mem_increase (Mi)" \
    '{namespace: $namespace, pod: $pod, reason: $reason,
      cpu_usage: $cpu_usage, mem_usage: $mem_usage,
      cpu_limit: $cpu_limit, mem_limit: $mem_limit,
      cpu_threshold: $cpu_threshold, mem_threshold: $mem_threshold,
      recommended_cpu_increase: $rec_cpu, recommended_mem_increase: $rec_mem}'
}

main() {
  # Ensure KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE, UTILIZATION_THRESHOLD, and DEFAULT_INCREASE are set
  if [[ -z "${KUBERNETES_DISTRIBUTION_BINARY:-}" || -z "${CONTEXT:-}" || -z "${NAMESPACE:-}" || -z "${UTILIZATION_THRESHOLD:-}" || -z "${DEFAULT_INCREASE:-}" ]]; then
    echo "KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE, UTILIZATION_THRESHOLD, and DEFAULT_INCREASE environment variables must be set."
    return 1
  fi

  local -a overutilized_pods=()
  local pod cpu_usage mem_usage limits cpu_limit mem_limit
  local cpu_threshold mem_threshold reason
  local recommended_cpu_increase recommended_mem_increase
  local json_output output_file
  local -r limits_path='{range .spec.containers[*]}{.resources.limits.cpu}{"|"}{.resources.limits.memory}{"\n"}{end}'

  # A single `top pod` call supplies the pod list and the usage figures.
  # It is read on fd 3 so commands inside the loop cannot consume the list.
  while read -r pod cpu_usage mem_usage _ <&3; do
    [[ -n "$pod" ]] || continue
    echo "---"
    echo "Processing Pod $pod"

    # top reports CPU in millicores ("12m") and memory in mebibytes ("34Mi").
    cpu_usage=${cpu_usage%m}
    mem_usage=${mem_usage%Mi}
    [[ "$cpu_usage" =~ ^[0-9]+$ ]] || cpu_usage=0
    [[ "$mem_usage" =~ ^[0-9]+$ ]] || mem_usage=0

    # One "cpu|memory" line per container; a field is empty when no limit is set.
    limits=$("${KUBERNETES_DISTRIBUTION_BINARY}" get pod "$pod" \
      -n "${NAMESPACE}" --context "${CONTEXT}" -o jsonpath="$limits_path")
    cpu_limit=$(cut -d'|' -f1 <<<"$limits" | sum_container_limits cpu_to_millicores)
    mem_limit=$(cut -d'|' -f2 <<<"$limits" | sum_container_limits memory_to_mi)

    echo "CPU Limit: $cpu_limit (m)"
    echo "CPU Usage: $cpu_usage (m)"
    echo "Memory Limit: $mem_limit (Mi)"
    echo "Memory Usage: $mem_usage (Mi)"

    # Calculate threshold values (0 when no limit is set)
    cpu_threshold=$(percent_of "$cpu_limit" "$UTILIZATION_THRESHOLD")
    mem_threshold=$(percent_of "$mem_limit" "$UTILIZATION_THRESHOLD")
    echo "CPU Threshold: $cpu_threshold (m)"
    echo "Memory Threshold: $mem_threshold (Mi)"

    recommended_cpu_increase=$(increase_by_percent "$cpu_limit" "$DEFAULT_INCREASE")
    recommended_mem_increase=$(increase_by_percent "$mem_limit" "$DEFAULT_INCREASE")

    # Check if the pod is overutilized
    reason=""
    if (( cpu_limit != 0 && cpu_usage > cpu_threshold )); then
      reason="CPU usage exceeds threshold"
    fi
    if (( mem_limit != 0 && mem_usage > mem_threshold )); then
      if [[ -n "$reason" ]]; then
        # Capital "Memory": the runbook looks for the exact, case-sensitive
        # text "Memory usage exceeds threshold" inside the reason.
        reason="$reason and Memory usage exceeds threshold"
      else
        reason="Memory usage exceeds threshold"
      fi
    fi
    if [[ -n "$reason" ]]; then
      overutilized_pods+=("$(overutilization_record "$reason")")
    fi

    # Check if the pod has an exit code of 137 or OOMKilled status
    if [[ "$(check_pod_status "$pod")" == "true" ]]; then
      overutilized_pods+=("$(overutilization_record "OOMKilled or exit code 137")")
    fi
  done 3< <("${KUBERNETES_DISTRIBUTION_BINARY}" top pod -n "${NAMESPACE}" --context "${CONTEXT}" --no-headers)

  # Convert the array to JSON format
  if (( ${#overutilized_pods[@]} == 0 )); then
    json_output="[]"
  else
    json_output=$(printf '%s\n' "${overutilized_pods[@]}" | jq -s '.')
  fi

  # Write the JSON output to a file
  output_file="$HOME/overutilized_pods.json"
  printf '%s\n' "$json_output" > "$output_file"

  # Print the JSON output
  echo "$json_output"
}

main "$@"
Recommendations in Namespace `${NAMESPACE}` +Identify VPA Pod Resource Recommendations in Namespace `${NAMESPACE}` [Documentation] Queries the namespace for any Vertical Pod Autoscaler resource recommendations. [Tags] recommendation resources utilization pods cpu memory allocation vpa ${NAMESPACE} ${vpa_usage}= RW.CLI.Run Bash File @@ -118,35 +118,69 @@ Identify Pod Resource Recommendations in Namespace `${NAMESPACE}` END RW.Core.Add Pre To Report ${vpa_usage.stdout}\n -Scan For Over Utilized Pods In Namespace `${NAMESPACE}` +Identify Resource Constrained Pods In Namespace `${NAMESPACE}` [Documentation] Scans the namespace for pods that are over utilizing resources or may be experiencing resource problems like oomkills or restarts. [Tags] overutilized resources utilization pods cpu memory allocation ${NAMESPACE} oomkill restarts - ${process}= RW.CLI.Run Bash File scan_overutilized_pods.sh - ... cmd_override=./scan_overutilized_pods.sh + ${pod_usage_analysis}= RW.CLI.Run Bash File identify_resource_contrained_pods.sh ... env=${env} ... secret_file__kubeconfig=${kubeconfig} - RW.Core.Add Pre To Report ${process.stdout} - IF "Pods overutilized and restarting" in """${process.stdout}""" - ${pod_names}= RW.CLI.Run Cli echo "${process.stdout}" | grep -A 1 "Pods overutilized and restarting" | tail -n 1 | head | tr -d '\n' - RW.Core.Add Issue title=Detected overutilized pods with restarts in namespace `${NAMESPACE}` - ... severity=3 - ... next_steps=Consider increasing the base requests of the following pods: `${pod_names.stdout}` in namesapce `${NAMESPACE}` - ... expected=The pods should not be restarting and have reasonable utilization. - ... actual=The pods are restarting and may be overutilized. - ... reproduce_hint=Run scan_overutilized_pods.sh - ... 
details=${process.stdout} - END - IF "Pods at limits" in """${process.stdout}""" - ${pod_names}= RW.CLI.Run Cli echo "${process.stdout}" | tail -n 1 - RW.Core.Add Issue title=Detected pods at their limits in namespace `${NAMESPACE}` - ... severity=3 - ... next_steps=Consider increasing the limits of the following pods: `${pod_names.stdout}` in namesapce `${NAMESPACE}` - ... expected=The pods should not be at their limits and have reasonable utilization. - ... actual=The pod is utilized to its limit. - ... reproduce_hint=Run scan_overutilized_pods.sh - ... details=${process.stdout} + ... show_in_rwl_cheatsheet=true + RW.Core.Add Pre To Report ${pod_usage_analysis.stdout} + ${overutilized_pods}= RW.CLI.Run Cli + ... cmd=cat $HOME/overutilized_pods.json | jq . + ... env=${env} + ${overutilized_pods_list}= Evaluate + ... json.loads(r'''${overutilized_pods.stdout}''') + ... json + FOR ${item} IN @{overutilized_pods_list} + ${item_owner}= RW.CLI.Run Bash File + ... bash_file=find_resource_owners.sh + ... cmd_override=./find_resource_owners.sh Pod ${item["pod"]} ${NAMESPACE} ${CONTEXT} + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... include_in_history=False + ${item_owner_output}= RW.CLI.Run Cli + ... cmd=echo "${item_owner.stdout}" | sed 's/ *$//' | tr -d '\n' + ... env=${env} + ... include_in_history=False + IF len($item_owner_output.stdout) > 0 and ($item_owner_output.stdout) != "No resource found" + ${owner_kind} ${owner_name}= Split String ${item_owner_output.stdout} ${SPACE} + ${owner_name}= Replace String ${owner_name} \n ${EMPTY} + ELSE + ${owner_kind}= Set Variable "Unknown" + ${owner_name}= Set Variable "Unknown" + END + IF 'CPU usage exceeds threshold' in $item['reason'] + RW.Core.Add Issue + ... severity=3 + ... expected=Pods should be operating under their designated resource limits + ... actual=Pods are above their designated resource limits + ... title= ${item["reason"]} for pod `${item["pod"]}` in `${item["namespace"]}` + ... 
reproduce_hint=${pod_usage_analysis.cmd} + ... details=${item} + ... next_steps=Increase CPU limits for ${owner_kind} `${owner_name}` to ${item["recommended_cpu_increase"]} in namespace `${item["namespace"]}` + END + IF 'Memory usage exceeds threshold' in $item['reason'] + RW.Core.Add Issue + ... severity=3 + ... expected=Pods should be operating under their designated resource limits + ... actual=Pods are above their designated resource limits + ... title= ${item["reason"]} for pod `${item["pod"]}` in `${item["namespace"]}` + ... reproduce_hint=${pod_usage_analysis.cmd} + ... details=${item} + ... next_steps=Increase memory limits for ${owner_kind} `${owner_name}` to ${item["recommended_mem_increase"]} in namespace `${item["namespace"]}` + END + IF 'OOMKilled or exit code 137' in $item['reason'] + RW.Core.Add Issue + ... severity=2 + ... expected=Pods should be operating under their designated resource limits + ... actual=Pods are above their designated resource limits + ... title= Container restarts detected pod `${item["pod"]}` in `${item["namespace"]}` due to exceeded memory usage + ... reproduce_hint=${pod_usage_analysis.cmd} + ... details=${item} + ... next_steps=Increase memory limits for ${owner_kind} `${owner_name}` to ${item["recommended_mem_increase"]} in namespace `${item["namespace"]}` + END END - *** Keywords *** Suite Initialization ${kubeconfig}= RW.Core.Import Secret @@ -184,14 +218,28 @@ Suite Initialization ... description=Home directory to execute scripts from ... example=/home ... default=/home/runwhen + ${UTILIZATION_THRESHOLD}= RW.Core.Import User Variable UTILIZATION_THRESHOLD + ... type=string + ... description=The resource usage threshold at which to identify issues. + ... pattern=\w* + ... example=95 + ... default=95 + ${DEFAULT_INCREASE}= RW.Core.Import User Variable DEFAULT_INCREASE + ... type=string + ... description=The percentage increase for resource recommendations. + ... pattern=\w* + ... example=25 + ... 
default=25 Set Suite Variable ${kubeconfig} ${kubeconfig} Set Suite Variable ${CONTEXT} ${CONTEXT} Set Suite Variable ${NAMESPACE} ${NAMESPACE} Set Suite Variable ${HOME} ${HOME} + Set Suite Variable ${DEFAULT_INCREASE} ${DEFAULT_INCREASE} + Set Suite Variable ${UTILIZATION_THRESHOLD} ${UTILIZATION_THRESHOLD} Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} Set Suite Variable ... ${env} - ... {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}", "HOME":"${HOME}"} + ... {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}","DEFAULT_INCREASE":"${DEFAULT_INCREASE}","UTILIZATION_THRESHOLD":"${UTILIZATION_THRESHOLD}", "HOME":"${HOME}"} IF "${LABELS}" != "" ${LABELS}= Set Variable -l ${LABELS} END diff --git a/codebundles/k8s-podresources-health/scan_overutilized_pods.sh b/codebundles/k8s-podresources-health/scan_overutilized_pods.sh deleted file mode 100755 index cb22356e..00000000 --- a/codebundles/k8s-podresources-health/scan_overutilized_pods.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash - -# Environment Variables: -# NAMESPACE -# CONTEXT - -CPU_PERCENT_THRESHOLD=95 -MEM_PERCENT_THRESHOLD=95 - -# Function that converts memory resource units to pure bytes -convert_memory_to_bytes() { - local resource=$1 - local bytes=0 - if [[ $resource =~ ^[0-9]+Ki$ ]]; then - bytes=$(( ${resource%Ki} * 1024 )) - elif [[ $resource =~ ^[0-9]+Mi$ ]]; then - bytes=$(( ${resource%Mi} * 1024 * 1024 )) - elif [[ $resource =~ ^[0-9]+Gi$ ]]; then - bytes=$(( ${resource%Gi} * 1024 * 1024 * 1024 )) - elif [[ $resource =~ ^[0-9]+Ti$ ]]; then - bytes=$(( ${resource%Ti} * 1024 * 1024 * 1024 * 1024 )) - elif [[ $resource =~ ^[0-9]+Pi$ ]]; then - bytes=$(( ${resource%Pi} * 1024 * 1024 * 1024 * 1024 * 1024 )) - elif [[ $resource =~ ^[0-9]+Ei$ ]]; then - 
bytes=$(( ${resource%Ei} * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 )) - elif [[ $resource =~ ^[0-9]+K$ ]]; then - bytes=$(( ${resource%K} * 1000 )) - elif [[ $resource =~ ^[0-9]+M$ ]]; then - bytes=$(( ${resource%M} * 1000 * 1000 )) - elif [[ $resource =~ ^[0-9]+G$ ]]; then - bytes=$(( ${resource%G} * 1000 * 1000 * 1000 )) - elif [[ $resource =~ ^[0-9]+T$ ]]; then - bytes=$(( ${resource%T} * 1000 * 1000 * 1000 * 1000 )) - elif [[ $resource =~ ^[0-9]+P$ ]]; then - bytes=$(( ${resource%P} * 1000 * 1000 * 1000 * 1000 * 1000 )) - elif [[ $resource =~ ^[0-9]+E$ ]]; then - bytes=$(( ${resource%E} * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 )) - else - bytes=$resource - fi - echo $bytes -} - -# Function that converts CPU resource units to milliCPU -convert_cpu_to_millicpu() { - local resource=$1 - local millicpu=0 - if [[ $resource =~ ^[0-9]+m$ ]]; then - millicpu=${resource%m} - else - millicpu=$(( resource * 1000 )) # Convert whole CPU to milliCPU - fi - echo $millicpu -} - -# Function that finds the owning resource of a pod and returns its name -get_owning_resource() { - local pod_name=$1 - kubectl get pod $pod_name -o jsonpath='{.metadata.ownerReferences[0].name}' -} - -# Function that gets the CPU and memory requests and limits of a given pod as JSON -get_pod_resources_config() { - local pod_name=$1 - kubectl get --context $CONTEXT -n $NAMESPACE pod $pod_name -o jsonpath='{.spec.containers[*].resources}' -} - -# Function that converts the output of kubectl top to JSON using protocol buffers -convert_top_to_json() { - kubectl top --context $CONTEXT -n $NAMESPACE pods --use-protocol-buffers=true --no-headers | awk ' - BEGIN { - print "[" - } - { - if (NR > 1) { - print "," - } - print " {" - print " \"name\": \"" $1 "\"," - print " \"cpu\": \"" $2 "\"," - print " \"memory\": \"" $3 "\"" - print " }" - } - END { - print "]" - }' -} - -main() { - util_restarting_pods=() - util_limit_pods=() - echo "Starting $NAMESPACE Namespace Scan For Overutilized Pods" - 
top_stats=$(convert_top_to_json) - top_stats_array=($(echo "$top_stats" | jq -c '.[]')) - for item in "${top_stats_array[@]}"; do - name=$(echo "$item" | jq -r '.name') - pod_restarts=$(kubectl get pod $name -n $NAMESPACE -o jsonpath='{.status.containerStatuses[0].restartCount}') - cpu=$(echo "$item" | jq -r '.cpu') - memory=$(echo "$item" | jq -r '.memory') - used_memory_bytes=$(convert_memory_to_bytes $memory) - used_cpu_millicpu=$(convert_cpu_to_millicpu $cpu) - # parse config for resource reservation info - resource_config=$(get_pod_resources_config $name) - requests_cpu=$(echo "$resource_config" | jq -r '.requests.cpu') - requests_memory=$(echo "$resource_config" | jq -r '.requests.memory') - requests_mcpu=$(convert_cpu_to_millicpu $requests_cpu) - requests_memory_bytes=$(convert_memory_to_bytes $requests_memory) - - limits_cpu=$(echo "$resource_config" | jq -r '.limits.cpu') - limits_memory=$(echo "$resource_config" | jq -r '.limits.memory') - limits_mcpu=$(convert_cpu_to_millicpu $limits_cpu) - limits_memory_bytes=$(convert_memory_to_bytes $limits_memory) - - requests_cpu_utilization=$(( used_cpu_millicpu * 100 / requests_mcpu )) - requests_memory_utilization=$(( used_memory_bytes * 100 / requests_memory_bytes )) - limits_cpu_utilization=$(( used_cpu_millicpu * 100 / limits_mcpu )) - limits_memory_utilization=$(( used_memory_bytes * 100 / limits_memory_bytes )) - - echo "Name: $name, Restarts: $pod_restarts, CPU: $cpu, Memory: $memory , mCPU: $used_cpu_millicpu, Memory Bytes: $used_memory_bytes, CPU Utilization: $requests_cpu_utilization%, Memory Utilization: $requests_memory_utilization%" - if [ $requests_cpu_utilization -gt $CPU_PERCENT_THRESHOLD ] && [ $pod_restarts -gt 0 ]; then - echo "Error: Pod $name has $requests_cpu_utilization% CPU utilization and $pod_restarts restarts" - if [[ " ${util_restarting_pods[@]} " =~ " ${name} " ]]; then - : # noop - else - util_restarting_pods+=("$name") - fi - fi - if [ $requests_memory_utilization -gt 
$MEM_PERCENT_THRESHOLD ] && [ $pod_restarts -gt 0 ]; then - echo "Error: Pod $name has $requests_memory_utilization% memory utilization and $pod_restarts restarts" - if [[ " ${util_restarting_pods[@]} " =~ " ${name} " ]]; then - : # noop - else - echo "add array $name" - util_restarting_pods+=("$name") - echo "array: ${util_restarting_pods[@]}" - fi - fi - if [ $limits_cpu_utilization -gt $CPU_PERCENT_THRESHOLD ]; then - echo "Error: Pod $name is at CPU limit $limits_cpu" - if [[ " ${util_limit_pods[@]} " =~ " ${name} " ]]; then - : # noop - else - util_limit_pods+=("$name") - fi - fi - if [ $limits_memory_utilization -gt $MEM_PERCENT_THRESHOLD ]; then - echo "Error: Pod $name is at memory limit $limits_memory" - if [[ " ${util_limit_pods[@]} " =~ " ${name} " ]]; then - : # noop - else - util_limit_pods+=("$name") - fi - fi - done - - if [ ${#util_restarting_pods[@]} -gt 0 ]; then - echo "" - echo "Pods overutilized and restarting:" - echo ${util_restarting_pods[@]} - fi - if [ ${#util_limit_pods[@]} -gt 0 ]; then - echo "" - echo "Pods at limits:" - echo ${util_limit_pods[@]} - fi -} - -main \ No newline at end of file