Add/cluster-resource-health-cb (#359)

* remove service ref * add cluster resource db * script fix * test exclusion of no requests set * clean up
runwhen-contrib · May 5, 2024 · 7d30f67 · 7d30f67
1 parent b23cbbf
commit 7d30f67
Show file tree

Hide file tree

Showing 12 changed files with 415 additions and 13 deletions.
diff --git a/...es/k8s-cluster-resource-health/.runwhen/generation-rules/k8s-cluster-resource-health.yaml b/...es/k8s-cluster-resource-health/.runwhen/generation-rules/k8s-cluster-resource-health.yaml
@@ -0,0 +1,23 @@
+# Generation rules for the k8s-cluster-resource-health codebundle.
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  platform: kubernetes
+  generationRules:
+    - resourceTypes:
+        - cluster
+      matchRules:
+        # ".+" is applied to the `name` property, so any cluster with a
+        # non-empty name matches (presumably a regex - confirm with the
+        # generation-rules schema).
+        - type: and
+          matches:
+            - type: pattern
+              pattern: ".+"
+              properties: [name]
+              mode: substring
+      slxs:
+        - baseName: cluster-resource
+          qualifiers: ["cluster"]
+          baseTemplateName: k8s-cluster-resource-health
+          levelOfDetail: basic
+          # NOTE(review): no `sli` output item is listed here although this
+          # codebundle ships an SLI template - confirm that is intentional.
+          outputItems:
+            - type: slx
+            - type: runbook
+              templateName: k8s-cluster-resource-health-taskset.yaml
diff --git a/...ndles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-sli.yaml b/...ndles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-sli.yaml
@@ -0,0 +1,36 @@
+# ServiceLevelIndicator template for the k8s-cluster-resource-health codebundle.
+# Rendered through Jinja before it is parsed as YAML.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  displayUnitsLong: OK
+  displayUnitsShort: ok
+  locations:
+    - {{ default_location }}
+  description: Counts the nodes in the cluster that are over 90% CPU or memory utilization.
+  codeBundle:
+    # Use the repository URL and git ref supplied at render time when present;
+    # otherwise fall back to the public code collection on the main branch.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/k8s-cluster-resource-health/sli.robot
+  intervalStrategy: intermezzo
+  intervalSeconds: 30
+  configProvided:
+    - name: CONTEXT
+      value: "{{cluster.context}}"
+    - name: KUBERNETES_DISTRIBUTION_BINARY
+      value: {{custom.kubernetes_distribution_binary}}
+  secretsProvided:
+    - name: kubeconfig
+      workspaceKey: {{custom.kubeconfig_secret_name}}
diff --git a/...ndles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-slx.yaml b/...ndles/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-slx.yaml
@@ -0,0 +1,21 @@
+# ServiceLevelX template for the k8s-cluster-resource-health codebundle.
+# Rendered through Jinja before it is parsed as YAML.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes-icon-color.svg
+  alias: {{ cluster.name }} Cluster Resource Health
+  asMeasuredBy: Node cpu and memory utilization.
+  configProvided:
+  # OBJECT_NAME carries the cluster name (same attribute used by `alias`
+  # and `additionalContext.cluster`).
+  - name: OBJECT_NAME
+    value: {{cluster.name}}
+  owners:
+  - {{workspace.owner_email}}
+  statement: Cluster resources for {{cluster.context}} should be less than 90% utilization.
+  additionalContext:
+    cluster: "{{ cluster.name }}"
+    context: "{{ cluster.context }}"
diff --git a/...s/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-taskset.yaml b/...s/k8s-cluster-resource-health/.runwhen/templates/k8s-cluster-resource-health-taskset.yaml
@@ -0,0 +1,30 @@
+# Runbook (taskset) template for the k8s-cluster-resource-health codebundle.
+# Rendered through Jinja before it is parsed as YAML.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  codeBundle:
+    # Use the repository URL and git ref supplied at render time when present;
+    # otherwise fall back to the public code collection on the main branch.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/k8s-cluster-resource-health/runbook.robot
+  # Values handed to runbook.robot, which imports CONTEXT and
+  # KUBERNETES_DISTRIBUTION_BINARY as user variables.
+  configProvided:
+    - name: CONTEXT
+      value: "{{cluster.context}}"
+    - name: KUBERNETES_DISTRIBUTION_BINARY
+      value: {{custom.kubernetes_distribution_binary}}
+  # The kubeconfig secret is imported by runbook.robot under the name `kubeconfig`.
+  secretsProvided:
+    - name: kubeconfig
+      workspaceKey: {{custom.kubeconfig_secret_name}}
diff --git a/codebundles/k8s-cluster-resource-health/README.md b/codebundles/k8s-cluster-resource-health/README.md
@@ -0,0 +1,20 @@
+# K8s Cluster Resource Health
+
+## SLI
+The Service Level Indicator counts the number of nodes that are over 90% active utilization according to `kubectl top nodes`.
+
+## TaskSet 
+### Identify High Utilization Nodes for Cluster
+Create a report of all nodes that are above 90% utilization. Raise issues for each node that is in this state. 
+
+### Identify Pods Causing High Node Utilization in Cluster
+This task identifies overutilized nodes and creates a report of each pod that is using more than its defined request. Since requests are what a cluster autoscaler uses to make decisions, this list should be used to increase the pod requests so that autoscalers can make better scaling decisions. 
+
+Raises an issue for each namespace
+
+
+## Requirements
+- Service account with permissions to: 
+    - get nodes
+    - list nodes
+    - get/list nodes in api group "metrics.k8s.io"
diff --git a/codebundles/k8s-cluster-resource-health/get_high_use_nodes.sh b/codebundles/k8s-cluster-resource-health/get_high_use_nodes.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Report nodes whose CPU or memory utilization is at or above 90% of their
+# allocatable capacity. The result is a JSON array that is written to
+# high_use_nodes.json and echoed to stdout. Requires jq.
+
+# Define Kubernetes binary and context with dynamic defaults
+KUBERNETES_DISTRIBUTION_BINARY="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" # Default to 'kubectl' if not set in the environment
+DEFAULT_CONTEXT=$(${KUBERNETES_DISTRIBUTION_BINARY} config current-context)
+CONTEXT="${CONTEXT:-$DEFAULT_CONTEXT}" # Use environment variable or the current context from kubectl
+
+
+# Print a JSON array of
+#   {name, cpu_utilization_percentage, memory_utilization_percentage}
+# for every node at or above 90% CPU or memory utilization.
+process_nodes_and_usage() {
+    # Get Node Details including allocatable resources, normalized to
+    # millicores (CPU) and MiB (memory) so they are comparable with the
+    # `top nodes` figures below.
+    nodes=$(${KUBERNETES_DISTRIBUTION_BINARY} get nodes --context "${CONTEXT}" -o json | jq '
+        # Allocatable CPU is reported either in cores ("4") or millicores ("3920m").
+        def cpu_to_millicores:
+            if test("m$") then rtrimstr("m") | tonumber
+            else tonumber * 1000
+            end;
+        # Allocatable memory is usually in Ki, but Mi, Gi and plain bytes are also valid quantities.
+        def mem_to_mib:
+            if   test("Ki$") then (rtrimstr("Ki") | tonumber) / 1024
+            elif test("Mi$") then (rtrimstr("Mi") | tonumber)
+            elif test("Gi$") then (rtrimstr("Gi") | tonumber) * 1024
+            else tonumber / 1048576
+            end;
+        [.items[] | {
+            name: .metadata.name,
+            cpu_allocatable: (.status.allocatable.cpu | cpu_to_millicores),
+            memory_allocatable: (.status.allocatable.memory | mem_to_mib)
+        }]')
+
+    # Fetch node usage details. Column 2 is CPU usage and column 4 is memory
+    # usage; "<unknown>" values are replaced with "0".
+    usage=$(${KUBERNETES_DISTRIBUTION_BINARY} top nodes --context "${CONTEXT}" | awk 'BEGIN { printf "[" } NR>1 { printf "%s{\"name\":\"%s\",\"cpu_usage\":\"%s\",\"memory_usage\":\"%s\"}", (NR>2 ? "," : ""), $1, ($2 == "<unknown>" ? "0" : $2), ($4 == "<unknown>" ? "0" : $4) } END { printf "]" }' | jq '.')
+
+    # Combine and process the data. `tonumber?` suppresses the error raised for
+    # a non-numeric value so that the `// 0` fallback can actually apply.
+    jq -n --argjson nodes "$nodes" --argjson usage "$usage" '{
+        nodes: $nodes | map({name: .name, cpu_allocatable: .cpu_allocatable, memory_allocatable: .memory_allocatable}),
+        usage: $usage | map({name: .name, cpu_usage: (.cpu_usage | rtrimstr("m") | tonumber? // 0), memory_usage: (.memory_usage | rtrimstr("Mi") | tonumber? // 0)})
+    } | .nodes as $nodes | .usage as $usage | 
+    $nodes | map(
+        . as $node | 
+        $usage[] | 
+        select(.name == $node.name) | 
+        {
+            name: .name, 
+            cpu_utilization_percentage: (.cpu_usage / $node.cpu_allocatable * 100),
+            memory_utilization_percentage: (.memory_usage / $node.memory_allocatable * 100)
+        }
+    ) | map(select(.cpu_utilization_percentage >= 90 or .memory_utilization_percentage >= 90))'
+}
+
+process_nodes_and_usage > high_use_nodes.json
+
+cat high_use_nodes.json
diff --git a/codebundles/k8s-cluster-resource-health/pods_impacting_high_use_nodes.sh b/codebundles/k8s-cluster-resource-health/pods_impacting_high_use_nodes.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+# Define Kubernetes binary and context with dynamic defaults
+KUBERNETES_DISTRIBUTION_BINARY="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" # Default to 'kubectl' if not set in the environment
+DEFAULT_CONTEXT=$(${KUBERNETES_DISTRIBUTION_BINARY} config current-context)
+CONTEXT="${CONTEXT:-$DEFAULT_CONTEXT}" # Use environment variable or the current context from kubectl
+
+process_nodes_and_usage() {
+    # Get Node Details including allocatable resources
+    nodes=$(${KUBERNETES_DISTRIBUTION_BINARY} get nodes --context ${CONTEXT} -o json | jq '[.items[] | {
+        name: .metadata.name,
+        cpu_allocatable: (.status.allocatable.cpu | rtrimstr("m") | tonumber),
+        memory_allocatable: (.status.allocatable.memory | gsub("Ki"; "") | tonumber / 1024)
+    }]')
+
+    # Fetch node usage details
+    usage=$(${KUBERNETES_DISTRIBUTION_BINARY} top nodes --context ${CONTEXT} | awk 'BEGIN { printf "[" } NR>1 { printf "%s{\"name\":\"%s\",\"cpu_usage\":\"%s\",\"memory_usage\":\"%s\"}", (NR>2 ? "," : ""), $1, ($2 == "<unknown>" ? "0" : $2), ($4 == "<unknown>" ? "0" : $4) } END { printf "]" }' | jq '.')
+
+    # Combine and process the data
+    jq -n --argjson nodes "$nodes" --argjson usage "$usage" '{
+        nodes: $nodes | map({name: .name, cpu_allocatable: .cpu_allocatable, memory_allocatable: .memory_allocatable}),
+        usage: $usage | map({name: .name, cpu_usage: (.cpu_usage | rtrimstr("m") | tonumber // 0), memory_usage: (.memory_usage | rtrimstr("Mi") | tonumber // 0)})
+    } | .nodes as $nodes | .usage as $usage | 
+    $nodes | map(
+        . as $node | 
+        $usage[] | 
+        select(.name == $node.name) | 
+        {
+            name: .name, 
+            cpu_utilization_percentage: (.cpu_usage / $node.cpu_allocatable * 100),
+            memory_utilization_percentage: (.memory_usage / $node.memory_allocatable * 100)
+        }
+    ) | map(select(.cpu_utilization_percentage >= 90 or .memory_utilization_percentage >= 90))'
+}
+
+
+# Fetch pod resource requests
+${KUBERNETES_DISTRIBUTION_BINARY} get pods --context ${CONTEXT} --all-namespaces -o json | jq -r '.items[] | {namespace: .metadata.namespace, pod: .metadata.name, nodeName: .spec.nodeName, cpu_request: (.spec.containers[].resources.requests.cpu // "0m"), memory_request: (.spec.containers[].resources.requests.memory // "0Mi")} | select(.cpu_request != "0m" and .memory_request != "0Mi")' | jq -s '.' > pod_requests.json
+
+
+
+# Fetch current pod metrics
+${KUBERNETES_DISTRIBUTION_BINARY} top pods --context ${CONTEXT} --all-namespaces --containers | awk 'BEGIN { printf "[" } NR>1 { printf "%s{\"namespace\":\"%s\",\"pod\":\"%s\",\"container\":\"%s\",\"cpu_usage\":\"%s\",\"memory_usage\":\"%s\"}", (NR>2 ? "," : ""), $1, $2, $3, $4, $5 } END { printf "]" }' | jq '.' > pod_usage.json
+
+
+
+# Normalize units and compare
+jq -s '[
+    .[0][] as $usage | 
+    .[1][] | 
+    select(.pod == $usage.pod and .namespace == $usage.namespace) |
+    {
+        pod: .pod,
+        namespace: .namespace,
+        node: .nodeName,
+        cpu_usage: $usage.cpu_usage,
+        cpu_request: .cpu_request,
+        cpu_usage_exceeds: (
+            # Convert CPU usage to millicores, assuming all inputs need to be converted from milli-units if they end with 'm'
+            ($usage.cpu_usage | 
+                if test("m$") then rtrimstr("m") | tonumber 
+                else tonumber * 1000 
+                end
+            ) > (
+                # Convert CPU request to millicores, assuming it may already be in millicores if it ends with 'm'
+                .cpu_request | 
+                if test("m$") then rtrimstr("m") | tonumber 
+                else tonumber * 1000 
+                end
+            )
+        ),
+        memory_usage: $usage.memory_usage,
+        memory_request: .memory_request,
+        memory_usage_exceeds: (
+            # Normalize memory usage to MiB, handling MiB and GiB
+            ($usage.memory_usage | 
+                if test("Gi$") then rtrimstr("Gi") | tonumber * 1024
+                elif test("G$") then rtrimstr("G") | tonumber * 1024
+                elif test("Mi$") then rtrimstr("Mi") | tonumber
+                elif test("M$") then rtrimstr("M") | tonumber
+                else tonumber
+                end
+            ) > (
+                # Normalize memory request to MiB
+                .memory_request | 
+                if test("Gi$") then rtrimstr("Gi") | tonumber * 1024
+                elif test("G$") then rtrimstr("G") | tonumber * 1024
+                elif test("Mi$") then rtrimstr("Mi") | tonumber
+                elif test("M$") then rtrimstr("M") | tonumber
+                else tonumber
+                end
+            )
+        )
+    }
+    | select(.cpu_usage_exceeds or .memory_usage_exceeds)
+] | group_by(.namespace) | map({(.[0].namespace): .}) | add' pod_usage.json pod_requests.json > pods_exceeding_requests.json
+
+cat pods_exceeding_requests.json
diff --git a/codebundles/k8s-cluster-resource-health/runbook.robot b/codebundles/k8s-cluster-resource-health/runbook.robot
@@ -0,0 +1,93 @@
+*** Settings ***
+Documentation       Identify resource constraints or issues in a cluster.
+Metadata            Author    stewartshea
+Metadata            Display Name    Kubernetes Cluster Resource Health
+Metadata            Supports    Kubernetes,AKS,EKS,GKE,OpenShift
+
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+Library             OperatingSystem
+Library             Collections
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Identify High Utilization Nodes for Cluster `${CONTEXT}`
+    [Documentation]    Identify nodes with high utilization. Requires jq.
+    [Tags]    cluster    resources    cpu    memory    utilization    saturation    exhaustion    starvation
+    # get_high_use_nodes.sh prints a JSON array of the nodes that are at or above 90% CPU or memory utilization.
+    ${node_usage_details}=    RW.CLI.Run Bash File
+    ...    bash_file=get_high_use_nodes.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    include_in_history=False
+    ...    show_in_rwl_cheatsheet=true
+    ${node_list}=    Evaluate    json.loads(r'''${node_usage_details.stdout}''')    json
+    # Raise a single issue when at least one node is reported.
+    IF    len(@{node_list}) > 0
+        RW.Core.Add Issue
+        ...    severity=2
+        ...    expected=Nodes in Cluster Context `${CONTEXT}` should have available CPU and Memory resources.
+        ...    actual=Nodes in Cluster Context `${CONTEXT}` are at or above 90% CPU or Memory utilization.
+        ...    title= Node usage is too high in Cluster Context `${CONTEXT}`.
+        ...    reproduce_hint=View Commands Used in Report Output
+        ...    details=Node CPU and Memory Utilization: ${node_list}
+        ...    next_steps=Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}` \nAdd Nodes to Cluster Context `${CONTEXT}`
+    END
+    RW.Core.Add Pre To Report    Node Usage Details:\n${node_usage_details.stdout}
+    RW.Core.Add Pre To Report    Commands Used:\n${node_usage_details.cmd}
+
+Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}`
+    [Documentation]    Identify nodes with high utilization and match to pods that are significantly above their resource request configuration. Requires jq.
+    [Tags]    pods    resources    requests    utilization    cpu    memory    exhaustion
+    # pods_impacting_high_use_nodes.sh prints a JSON object keyed by namespace; each value lists the entries whose usage exceeds their requests.
+    ${pod_and_node_usage_details}=    RW.CLI.Run Bash File
+    ...    bash_file=pods_impacting_high_use_nodes.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    include_in_history=False
+    ...    show_in_rwl_cheatsheet=true
+
+    ${namespace_list}=    Evaluate    json.loads(r'''${pod_and_node_usage_details.stdout}''')    json
+
+    # Raise one issue per namespace found in the script output.
+    IF    len(@{namespace_list}) > 0
+        FOR    ${item}    IN    @{namespace_list}
+            ${pod_details}=    Get From Dictionary    ${namespace_list}    ${item}
+            RW.Core.Add Issue
+            ...    severity=2
+            ...    expected=Pods in Cluster Context `${CONTEXT}` should have appropriate resource requests that do not cause pressure.
+            ...    actual=Pods in Cluster Context `${CONTEXT}` are causing resource pressure.
+            ...    title= Pods in namespace `${item}` are contributing to resource pressure in Cluster Context `${CONTEXT}`.
+            ...    reproduce_hint=View Commands Used in Report Output
+            ...    details=Pod CPU and Memory Usage Compared to Requests: ${pod_details}
+            ...    next_steps=Add Nodes to Cluster Context `${CONTEXT}` \nIncrease Pod Resource Requests \nIdentify Pod Resource Recommendations in Namespace `${item}`
+        END
+    END
+
+    RW.Core.Add Pre To Report    Pods Needing Adjustment:\n${pod_and_node_usage_details.stdout}
+    RW.Core.Add Pre To Report    Commands Used:\n${pod_and_node_usage_details.cmd}
+
+
+*** Keywords ***
+Suite Initialization
+    # Suite setup: imports the kubeconfig secret and the user variables, then
+    # publishes them as suite variables for the tasks in this file.
+    ${kubeconfig}=    RW.Core.Import Secret
+    ...    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
+    ...    pattern=\w*
+    ...    example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
+    # CLI binary used by the bash scripts; declared default is kubectl.
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    description=Which binary to use for Kubernetes CLI commands.
+    ...    enum=[kubectl,oc]
+    ...    example=kubectl
+    ...    default=kubectl
+    # Kubernetes context that the tasks operate against.
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Which Kubernetes context to operate within.
+    ...    pattern=\w*
+    ...    default=default
+    ...    example=my-main-cluster
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    # Environment passed to the bash scripts: KUBECONFIG is set to a path built from the secret's key.
+    Set Suite Variable    ${env}    {"KUBECONFIG":"./${kubeconfig.key}"}