updates resource task (#382)

* switch order of add to report in case there are no issues and empty json loads fails * add previous pods logs to log scan * rewrite cpu resource recommendations * add config provided to template * change title * change title * remove ticks from resource change recommendation
runwhen-contrib · Jun 1, 2024 · e953c4a · e953c4a
1 parent b7d0bb7
commit e953c4a
Show file tree

Hide file tree

Showing 9 changed files with 291 additions and 221 deletions.
diff --git a/codebundles/gcp-bucket-health/check_security.sh b/codebundles/gcp-bucket-health/check_security.sh
@@ -109,6 +109,8 @@ done
 echo "Security Issues:"
 if [ ${#ISSUES[@]} -eq 0 ]; then
   echo "No security issues found."
+  # Write a JSON list holding a single empty object so that json loads doesn't fail.
+  echo "[{}]" > $HOME/bucket_security_issues.json
 else
   echo "${ISSUES[@]}" | jq -s . > $HOME/bucket_security_issues.json
   cat $HOME/bucket_security_issues.json

diff --git a/codebundles/gcp-bucket-health/runbook.robot b/codebundles/gcp-bucket-health/runbook.robot
@@ -61,6 +61,9 @@ Check GCP Bucket Security Configuration for `${PROJECT_IDS}`
     ...    env=${env}
     ...    secret_file__gcp_credentials_json=${gcp_credentials_json}
     ...    show_in_rwl_cheatsheet=true
+    RW.Core.Add Pre To Report    GCP Security Configuration Check:\n${bucket_security_configuration.stdout}
+    RW.Core.Add Pre To Report    Commands Used:\n${bucket_security_configuration.cmd}
+
     ${bucket_security_output}=    RW.CLI.Run Cli
     ...    cmd=cat $HOME/bucket_security_issues.json | jq .
     ...    env=${env}
@@ -82,8 +85,6 @@ Check GCP Bucket Security Configuration for `${PROJECT_IDS}`
             ...    next_steps=Review IAM configuration for GCP storage bucket `${item["bucket"]}` in project `${item["project"]}`
         END
     END
-    RW.Core.Add Pre To Report    GCP Security Configuration Check:\n${bucket_security_configuration.stdout}
-    RW.Core.Add Pre To Report    Commands Used:\n${bucket_security_configuration.cmd}
 
 *** Keywords ***
 Suite Initialization

diff --git a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh b/codebundles/k8s-deployment-healthcheck/deployment_logs.sh
@@ -70,34 +70,43 @@ if [ -z "$SELECTOR" ]; then
     exit 1
 fi
 
+fetch_logs() {
+    local POD=$1
+    local CONTAINER=$2
+    local PREVIOUS_FLAG=$3
+
+    if [ -n "$LOGS_ERROR_PATTERN" ] && [ -n "$LOGS_EXCLUDE_PATTERN" ]; then
+        # Both error and exclusion patterns provided
+        LOGS=$($KUBERNETES_DISTRIBUTION_BINARY logs $POD -c $CONTAINER $PREVIOUS_FLAG --limit-bytes=256000 --since=3h --context=$CONTEXT -n $NAMESPACE | grep -Ei "$LOGS_ERROR_PATTERN" | grep -Eiv "$LOGS_EXCLUDE_PATTERN")
+    elif [ -n "$LOGS_ERROR_PATTERN" ]; then
+        # Only error pattern provided
+        LOGS=$($KUBERNETES_DISTRIBUTION_BINARY logs $POD -c $CONTAINER $PREVIOUS_FLAG --limit-bytes=256000 --since=3h --context=$CONTEXT -n $NAMESPACE | grep -Ei "$LOGS_ERROR_PATTERN")
+    elif [ -n "$LOGS_EXCLUDE_PATTERN" ]; then
+        # Only exclusion pattern provided
+        LOGS=$($KUBERNETES_DISTRIBUTION_BINARY logs $POD -c $CONTAINER $PREVIOUS_FLAG --limit-bytes=256000 --since=3h --context=$CONTEXT -n $NAMESPACE | grep -Eiv "$LOGS_EXCLUDE_PATTERN")
+    else
+        # Neither pattern provided
+        LOGS=$($KUBERNETES_DISTRIBUTION_BINARY logs $POD -c $CONTAINER $PREVIOUS_FLAG --limit-bytes=256000 --since=3h --context=$CONTEXT -n $NAMESPACE)
+    fi
+
+    # Check log format and store appropriately
+    FIRST_LINE=$(echo "$LOGS" | head -n 1)
+    EXT=$(echo "$FIRST_LINE" | jq -e . &>/dev/null && echo "json" || echo "txt")
+    FILENAME="${POD}_${CONTAINER}_logs${PREVIOUS_FLAG:+_previous}.$EXT"
+    LOG_FILES+=("$FILENAME")
+    echo "Fetching logs for Pod: $POD, Container: $CONTAINER. Saving to $FILENAME."
+    echo "$LOGS" > $FILENAME
+}
+
 # Iterate through the pods matching the selector and fetch current and previous (-p) logs for every container
-LOG_FILES=() 
 while read POD; do
-    CONTAINERS=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod $POD -n ${NAMESPACE} --context=${CONTEXT} -o=jsonpath='{range .spec.containers[*]}{.name}{"\n"}{end}')
+    CONTAINERS=$($KUBERNETES_DISTRIBUTION_BINARY get pod $POD -n $NAMESPACE --context=$CONTEXT -o=jsonpath='{range .spec.containers[*]}{.name}{"\n"}{end}')
     for CONTAINER in $CONTAINERS; do
-        if [ -n "$LOGS_ERROR_PATTERN" ] && [ -n "$LOGS_EXCLUDE_PATTERN" ]; then
-            # Both error and exclusion patterns provided
-            LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Ei "${LOGS_ERROR_PATTERN}" | grep -Eiv "${LOGS_EXCLUDE_PATTERN}")
-        elif [ -n "$LOGS_ERROR_PATTERN" ]; then
-            # Only error pattern provided
-            LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Ei "${LOGS_ERROR_PATTERN}")
-        elif [ -n "$LOGS_EXCLUDE_PATTERN" ]; then
-            # Only exclusion pattern provided
-            LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE} | grep -Eiv "${LOGS_EXCLUDE_PATTERN}")
-        else
-            # Neither pattern provided
-            LOGS=$(${KUBERNETES_DISTRIBUTION_BINARY} logs $POD -c $CONTAINER --limit-bytes=256000 --since=3h --context=${CONTEXT} -n ${NAMESPACE})
-        fi
-
-        # Check log format and store appropriately
-        FIRST_LINE=$(echo "$LOGS" | head -n 1)
-        EXT=$(echo "$FIRST_LINE" | jq -e . &>/dev/null && echo "json" || echo "txt")
-        FILENAME="${POD}_${CONTAINER}_logs.$EXT"
-        LOG_FILES+=("$FILENAME")
-        echo "Fetching logs for Pod: $POD, Container: $CONTAINER. Saving to $FILENAME."
-        echo "$LOGS" > $FILENAME
+        fetch_logs $POD $CONTAINER ""
+        fetch_logs $POD $CONTAINER "-p"
     done
-done < <(${KUBERNETES_DISTRIBUTION_BINARY} get pods --selector=$SELECTOR -n ${NAMESPACE} --context=${CONTEXT} -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
+done < <($KUBERNETES_DISTRIBUTION_BINARY get pods --selector=$SELECTOR -n $NAMESPACE --context=$CONTEXT -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
+
 
 # Initialize an issue description array
 issue_descriptions=()

diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot
@@ -61,7 +61,6 @@ Check Deployment Log For Issues with `${DEPLOYMENT_NAME}`
     ...    Recent logs from Deployment ${DEPLOYMENT_NAME} in Namespace ${NAMESPACE}:\n\n${logs.stdout}
     RW.Core.Add Pre To Report    Commands Used: ${history}
 
-# Fetch Previous Logs for Deployment `${DEPLOYMENT_NAME}`
 
 Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`
     [Documentation]    Validates if a Liveliness probe has possible misconfigurations

diff --git a/codebundles/k8s-podresources-health/.runwhen/templates/k8s-pod-resources-taskset.yaml b/codebundles/k8s-podresources-health/.runwhen/templates/k8s-pod-resources-taskset.yaml
@@ -29,6 +29,10 @@ spec:
       value: {{context}}
     - name: LABELS
       value: ''
+    - name: UTILIZATION_THRESHOLD
+      value: 95
+    - name: DEFAULT_INCREASE
+      value: 25
   secretsProvided:
     - name: kubeconfig
       workspaceKey: {{custom.kubeconfig_secret_name}}
diff --git a/codebundles/k8s-podresources-health/find_resource_owners.sh b/codebundles/k8s-podresources-health/find_resource_owners.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# set -eo pipefail
+
+# -----------------------------------------------------------------------------
+# Script Information and Metadata
+# -----------------------------------------------------------------------------
+# Author: @stewartshea
+# Description: This script takes in some information about a
+# resource (typically a pod) and returns its owner.
+# NOTES: 
+# Not sure if this is best served as a bash script or keyword
+# This is quickly added and likely requires further expansion
+# Not sure if it makes sense to keep this as a shared script, or 
+# packaged multiple times with each codebundle depending on cases 
+# -----------------------------------------------------------------------------
+
+# Positional arguments: the kind of resource, its name (or part of it),
+# the namespace, and the kubeconfig context to query.
+RESOURCE_KIND="$1"
+RESOURCE_NAME="$2"
+NAMESPACE="$3"
+CONTEXT="$4"
+
+# Kubernetes CLI binary. A value already present in the environment (for
+# example oc) is honoured; kubectl is the fallback when none is set.
+KUBERNETES_DISTRIBUTION_BINARY="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
+
+# Function to get the owner of a resource
+get_owner() {
+    local resource_name=$1
+    local resource_kind=$2
+    owner_kind=$(${KUBERNETES_DISTRIBUTION_BINARY} get $resource_kind $resource_name -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="{.metadata.ownerReferences[0].kind}")
+    if [ "$owner_kind" = "" ]; then
+        # No owner reference means there is no parent object. Return the direct object.
+        echo "$resource_kind $resource_name"
+    elif [ "$owner_kind" = "ReplicaSet" ]; then
+        replicaset=$(${KUBERNETES_DISTRIBUTION_BINARY} get $resource_kind $resource_name -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="{.metadata.ownerReferences[0].name}")
+        deployment_name=$(${KUBERNETES_DISTRIBUTION_BINARY} get replicaset $replicaset -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="{.metadata.ownerReferences[0].name}")
+        echo "Deployment $deployment_name"
+    else
+        owner_info=$(${KUBERNETES_DISTRIBUTION_BINARY} get $resource_kind $resource_name -n "${NAMESPACE}" --context="${CONTEXT}" -o=jsonpath="{.metadata.ownerReferences[0].name}")
+        echo "$owner_kind $owner_info"
+    fi
+}
+
+
+# List the resources of the requested kind and keep the names of those whose
+# listing line matches RESOURCE_NAME. --no-headers keeps the column header
+# row from ever being matched; "|| true" tolerates grep finding nothing.
+resources=$("${KUBERNETES_DISTRIBUTION_BINARY}" get "$RESOURCE_KIND" --no-headers -n "${NAMESPACE}" --context="${CONTEXT}" | grep -- "${RESOURCE_NAME}" | awk '{print $1}' || true)
+
+if [ -z "$resources" ]; then
+    echo "No resource found"
+else
+    # Report the owner of the first matching resource that yields one, then stop.
+    # The here-string is quoted so the one-name-per-line layout is preserved.
+    while IFS= read -r resource_name; do
+        if [ -n "$resource_name" ]; then
+            owner=$(get_owner "$resource_name" "$RESOURCE_KIND")
+            if [ -n "$owner" ]; then
+                # Emit the owner without a trailing newline.
+                printf '%s' "$owner" | tr -d '\n'
+                exit 0
+            fi
+        fi
+    done <<< "$resources"
+fi
diff --git a/codebundles/k8s-podresources-health/identify_resource_contrained_pods.sh b/codebundles/k8s-podresources-health/identify_resource_contrained_pods.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Ensure KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE, UTILIZATION_THRESHOLD, and DEFAULT_INCREASE are set
+if [[ -z "${KUBERNETES_DISTRIBUTION_BINARY}" || -z "${CONTEXT}" || -z "${NAMESPACE}" || -z "${UTILIZATION_THRESHOLD}" || -z "${DEFAULT_INCREASE}" ]]; then
+  echo "KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE, UTILIZATION_THRESHOLD, and DEFAULT_INCREASE environment variables must be set."
+  exit 1
+fi
+
+# Function to check if a pod has OOMKilled status or exit code 137
+check_pod_status() {
+  local pod_name=$1
+
+  pod_status=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "${NAMESPACE}" --context ${CONTEXT} -o jsonpath='{.status.containerStatuses[*].state.terminated.reason}')
+  exit_code=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "${NAMESPACE}" --context ${CONTEXT} -o jsonpath='{.status.containerStatuses[*].state.terminated.exitCode}')
+
+  if [[ $pod_status == *"OOMKilled"* || $exit_code -eq 137 ]]; then
+    echo true
+  else
+    echo false
+  fi
+}
+
+# Initialize an empty array to store overutilized pods
+overutilized_pods=()
+
+# Get the list of pods and their resource usage in the specified namespace
+pods=$(${KUBERNETES_DISTRIBUTION_BINARY} top pod -n "${NAMESPACE}" --context ${CONTEXT} --no-headers | awk '{print $1}')
+
+# Loop through each pod
+for pod in $pods; do
+  # Get pod resource limits
+  echo "---"
+  echo "Processing Pod $pod"
+  cpu_limit=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod" -n "${NAMESPACE}" --context ${CONTEXT} -o jsonpath='{.spec.containers[*].resources.limits.cpu}')
+  mem_limit=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod" -n "${NAMESPACE}" --context ${CONTEXT} -o jsonpath='{.spec.containers[*].resources.limits.memory}')
+
+  # Convert memory limit to Mi
+  if [[ $mem_limit == *Gi ]]; then
+    mem_limit=$(echo "$mem_limit" | sed 's/Gi//' | awk '{printf "%.0f", $1 * 1024}')
+  elif [[ $mem_limit == *Mi ]]; then
+    mem_limit=$(echo "$mem_limit" | sed 's/Mi//')
+  fi
+
+  # Convert CPU limit to millicores
+  if [[ $cpu_limit == *m ]]; then
+    cpu_limit=$(echo "$cpu_limit" | sed 's/m//')
+  else
+    cpu_limit=$(echo "$cpu_limit" | awk '{printf "%.0f", $1 * 1000}')
+  fi
+
+  # Handle cases where limits are not set (0 or empty)
+  cpu_limit=${cpu_limit:-0}
+  mem_limit=${mem_limit:-0}
+
+  # Get pod current resource usage
+  cpu_usage=$(${KUBERNETES_DISTRIBUTION_BINARY} top pod "$pod" -n "${NAMESPACE}" --context ${CONTEXT} --no-headers | awk '{print $2}' | sed 's/m//')
+  mem_usage=$(${KUBERNETES_DISTRIBUTION_BINARY} top pod "$pod" -n "${NAMESPACE}" --context ${CONTEXT} --no-headers | awk '{print $3}' | sed 's/Mi//')
+  echo "CPU Limit: $cpu_limit (m)"
+  echo "CPU Usage: $cpu_usage (m)"
+  echo "Memory Limit: $mem_limit (Mi)"
+  echo "Memory Usage: $mem_usage (Mi)"
+
+  # Calculate threshold values
+  if [[ $cpu_limit -ne 0 ]]; then
+    cpu_threshold=$(awk "BEGIN {printf \"%.0f\", $cpu_limit * $UTILIZATION_THRESHOLD / 100}")
+  else
+    cpu_threshold=0
+  fi
+
+  if [[ $mem_limit -ne 0 ]]; then
+    mem_threshold=$(awk "BEGIN {printf \"%.0f\", $mem_limit * $UTILIZATION_THRESHOLD / 100}")
+  else
+    mem_threshold=0
+  fi
+
+  echo "CPU Threshold: $cpu_threshold (m)"
+  echo "Memory Threshold: $mem_threshold (Mi)"
+
+  # Check if the pod is overutilized
+  reason=""
+  if [[ $cpu_limit -ne 0 && $cpu_usage -gt $cpu_threshold ]]; then
+    reason="CPU usage exceeds threshold"
+  fi
+  if [[ $mem_limit -ne 0 && $mem_usage -gt $mem_threshold ]]; then
+    if [[ -n $reason ]]; then
+      reason="$reason and memory usage exceeds threshold"
+    else
+      reason="Memory usage exceeds threshold"
+    fi
+  fi
+
+  if [[ -n $reason ]]; then
+    recommended_cpu_increase=$(awk "BEGIN {printf \"%.0f\", $cpu_limit * (1 + $DEFAULT_INCREASE / 100)}")
+    recommended_mem_increase=$(awk "BEGIN {printf \"%.0f\", $mem_limit * (1 + $DEFAULT_INCREASE / 100)}")
+    overutilized_pods+=("{\"namespace\":\"${NAMESPACE}\", \"pod\":\"$pod\", \"reason\":\"$reason\", \"cpu_usage\":\"$cpu_usage\", \"mem_usage\":\"$mem_usage\", \"cpu_limit\":\"$cpu_limit\", \"mem_limit\":\"$mem_limit\", \"cpu_threshold\":\"$cpu_threshold\", \"mem_threshold\":\"$mem_threshold\", \"recommended_cpu_increase\":\"$recommended_cpu_increase (m)\", \"recommended_mem_increase\":\"$recommended_mem_increase (Mi)\"}")
+  fi
+
+  # Check if the pod has an exit code of 137 or OOMKilled status
+  if [[ $(check_pod_status "$pod") == "true" ]]; then
+    overutilized_pods+=("{\"namespace\":\"${NAMESPACE}\", \"pod\":\"$pod\", \"reason\":\"OOMKilled or exit code 137\", \"cpu_usage\":\"$cpu_usage\", \"mem_usage\":\"$mem_usage\", \"cpu_limit\":\"$cpu_limit\", \"mem_limit\":\"$mem_limit\"}")
+  fi
+done
+
+# Convert the array to JSON format
+json_output=$(printf '%s\n' "${overutilized_pods[@]}" | jq -s '.')
+
+# Write the JSON output to a file
+output_file="$HOME/overutilized_pods.json"
+if [[ ${#overutilized_pods[@]} -eq 0 ]]; then
+  echo "[]" > "$output_file"
+else
+  echo "$json_output" > "$output_file"
+fi
+
+# Print the JSON output
+echo "$json_output"