Cb/rm updates with remediate (#311)

* add timeout to node premempt sli * update resource check task and script * update workload next steps * add details to report * remove legacy service components * resource quota updates and remediation tasks * set empty default context * fix object_type/object_name * change format to avoid md rendering * update report format * revert report back * script update. add podresource vpa check * remove kubectl service * update var * typo fix * text update * wip remediation * wip * restructure and stack github changes into single PR based on object/file * tag update * fix vars * update next step * fix vars * test splitting the recommendations into two next steps? * update vpa for remediate * update gen rule --------- Co-authored-by: RunWhen Runsession Bot <runsessions@runwhen.com>
runwhen-contrib · Jan 29, 2024 · 6177043 · 6177043
1 parent a535597
commit 6177043
Show file tree

Hide file tree

Showing 17 changed files with 1,447 additions and 39 deletions.
diff --git a/codebundles/gcloud-node-preempt/sli.robot b/codebundles/gcloud-node-preempt/sli.robot
@@ -43,5 +43,6 @@ Count the number of nodes in active prempt operation
     ...    cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && gcloud compute operations list --filter='operationType:(compute.instances.preempted)' --format=json --project=${GCP_PROJECT_ID} | jq -r --arg now "$(date -u +%s)" '[.[] | select((.startTime | sub("\\\\.[0-9]+"; "") | strptime("%Y-%m-%dT%H:%M:%S%z") | mktime) > ($now | tonumber - (${AGE}*60)))] | length'
     ...    env=${env}
     ...    secret_file__gcp_credentials_json=${gcp_credentials_json}
+    ...    timeout_seconds=180
     ${metric}=     Convert To Number    ${preempt_node_list.stdout}
     RW.Core.Push Metric    ${metric}
diff --git a/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh b/codebundles/k8s-deployment-healthcheck/workload_next_steps.sh
@@ -54,6 +54,9 @@ if [[ $messages =~ "ImagePullBackOff" || $messages =~ "Back-off pulling image" |
     next_steps+=("List Images and Tags for Every Container in Failed Pods for Namespace \`$NAMESPACE\`")
 fi
 
+# When the collected messages mention a failed quota check, recommend the
+# quota utilization task. The backticks are escaped so the namespace is
+# emitted as inline code rather than executed as a command substitution.
+if [[ $messages =~ "forbidden: failed quota" ]]; then
+    next_steps+=("Check Resource Quota Utilization in Namespace \`${NAMESPACE}\`")
+fi
 
 # Display the list of recommendations
 printf "%s\n" "${next_steps[@]}" | sort | uniq
diff --git a/codebundles/k8s-gitops-gh-remediate/.runwhen/generation-rules/k8s-gitops-gh-remediate.yaml b/codebundles/k8s-gitops-gh-remediate/.runwhen/generation-rules/k8s-gitops-gh-remediate.yaml
@@ -0,0 +1,31 @@
+# Generation rules for the k8s-gitops-gh-remediate codebundle.
+# NOTE(review): matching semantics are defined by the RunWhen generation
+# tooling, which is not visible here; comments describe only what is declared.
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  generationRules:
+    - resourceTypes:
+        - namespace
+      matchRules:
+        # The three patterns below are grouped under a single "and" rule.
+        - type: and
+          matches:
+            # Pattern ".+" against the name property (presumably: any non-empty name).
+            - type: pattern
+              pattern: ".+"
+              properties: [name]
+              mode: substring
+            # Labels must contain the text "kustomize.toolkit.fluxcd.io/name".
+            - type: pattern
+              pattern: "kustomize.toolkit.fluxcd.io/name"
+              properties: [labels]
+              mode: substring
+            # The custom/gitops_provider variable must contain "github".
+            - resourceType: variables
+              type: pattern
+              pattern: "github"
+              properties: [custom/gitops_provider]
+              mode: substring
+      slxs:
+        - baseName: gitops-gh-fix
+          levelOfDetail: detailed
+          qualifiers: ["namespace", "cluster"]
+          baseTemplateName: k8s-gitops-gh-remediate
+          # Two output items: an SLX, and a runbook with an explicit template name.
+          outputItems:
+            - type: slx
+            - type: runbook
+              templateName: k8s-gitops-gh-remediate-taskset.yaml
diff --git a/codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-slx.yaml b/codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-slx.yaml
@@ -0,0 +1,23 @@
+# ServiceLevelX template for GitOps (GitHub) remediation.
+# All identifying values are filled in from the matched resource's metadata.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/github-mark.svg
+  alias: {{match_resource.resource.metadata.namespace}} Deployment GitOps Configuration Remediations
+  asMeasuredBy: "" 
+  configProvided:
+  # OBJECT_NAME is the matched resource's own name.
+  - name: OBJECT_NAME
+    value: {{match_resource.resource.metadata.name}}
+  owners:
+  - {{workspace.owner_email}}
+  statement: Remediate deployments in Namespace {{match_resource.resource.metadata.namespace}} managed in GitHub repositories. 
+  # Extra context: namespace and labels from the matched resource, plus the cluster name and context.
+  additionalContext:  
+    namespace: "{{match_resource.resource.metadata.namespace}}"
+    labelMap: "{{match_resource.resource.metadata.labels}}"
+    cluster: "{{ cluster.name }}"
+    context: "{{ cluster.context }}"
diff --git a/codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-taskset.yaml b/codebundles/k8s-gitops-gh-remediate/.runwhen/templates/k8s-gitops-gh-remediate-taskset.yaml
@@ -0,0 +1,34 @@
+# Runbook template for the k8s-gitops-gh-remediate codebundle.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  codeBundle:
+    # Use the caller-supplied repository/ref when given, else the public defaults.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    # Must match this codebundle's directory (codebundles/k8s-gitops-gh-remediate).
+    pathToRobot: codebundles/k8s-gitops-gh-remediate/runbook.robot
+  configProvided:
+    - name: NAMESPACE
+      value: {{match_resource.resource.metadata.namespace}}
+    - name: CONTEXT
+      value: {{context}}
+    - name: KUBERNETES_DISTRIBUTION_BINARY
+      value: {{custom.kubernetes_distribution_binary}}
+  secretsProvided:
+    - name: kubeconfig
+      workspaceKey: {{custom.kubeconfig_secret_name}}
+    - name: github_token
+      workspaceKey: {{custom.github_token_secret_name}}
diff --git a/codebundles/k8s-gitops-gh-remediate/README.md b/codebundles/k8s-gitops-gh-remediate/README.md
@@ -0,0 +1,23 @@
+# Kubernetes GitOps GitHub Remediate
+
+This codebundle provides a suite of tasks for remediating configuration issues in Kubernetes deployments that are managed in GitHub repositories.
+
+## Tasks
+- `Remediate Readiness and Liveness Probe GitOps Manifests in Namespace`
+- `Increase ResourceQuota for Namespace`
+- `Adjust Pod Resources to Match VPA Recommendation in`
+
+## Configuration
+The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:
+
+- `kubeconfig`: The kubeconfig secret containing access info for the cluster.
+- `github_token`: The secret containing the GitHub token; the generated runbook lists it under `secretsProvided`.
+- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.
+- `CONTEXT`: The Kubernetes context to operate within.
+- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.
+
+## Requirements
+- A kubeconfig with appropriate RBAC permissions to perform the desired command.
+
+## TODO
+- [ ] Add additional documentation.
+
diff --git a/codebundles/k8s-gitops-gh-remediate/meta.yaml b/codebundles/k8s-gitops-gh-remediate/meta.yaml
@@ -0,0 +1 @@
+commands: []
diff --git a/codebundles/k8s-gitops-gh-remediate/resource_quota_check.sh b/codebundles/k8s-gitops-gh-remediate/resource_quota_check.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# Comma-separated JSON recommendation objects collected by check_usage.
+# Kept as a plain string (not an array): it is only ever concatenated and
+# finally wrapped in [ ] for jq at the end of the script.
+recommendations=""
+
+# Convert a Kubernetes memory quantity to whole mebibytes (Mi), rounding down.
+#
+# Arguments: $1 - quantity such as 512Mi, 2Gi, 1.5Gi, 500M or 1048576 (bytes)
+# Outputs:   integer Mi value on stdout; 0 when the quantity cannot be parsed
+# Returns:   0 on success, 1 when the quantity is not understood
+convert_memory_to_mib() {
+    local memory=$1
+    local number suffix bytes_per_unit
+    local quantity_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([A-Za-z]*)$'
+
+    # Split into a (possibly fractional) number and an optional unit suffix.
+    if [[ $memory =~ $quantity_re ]]; then
+        number=${BASH_REMATCH[1]}
+        suffix=${BASH_REMATCH[2]}
+    else
+        echo "convert_memory_to_mib: cannot parse quantity '$memory'" >&2
+        echo 0
+        return 1
+    fi
+
+    # Binary (Ki, Mi, ...) and decimal (k, M, ...) suffixes; no suffix = bytes.
+    case $suffix in
+        "")  bytes_per_unit=1 ;;
+        m)   bytes_per_unit=0.001 ;;
+        k|K) bytes_per_unit=1000 ;;
+        M)   bytes_per_unit=1000000 ;;
+        G)   bytes_per_unit=1000000000 ;;
+        T)   bytes_per_unit=1000000000000 ;;
+        P)   bytes_per_unit=1000000000000000 ;;
+        E)   bytes_per_unit=1000000000000000000 ;;
+        Ki)  bytes_per_unit=1024 ;;
+        Mi)  bytes_per_unit=1048576 ;;
+        Gi)  bytes_per_unit=1073741824 ;;
+        Ti)  bytes_per_unit=1099511627776 ;;
+        Pi)  bytes_per_unit=1125899906842624 ;;
+        Ei)  bytes_per_unit=1152921504606846976 ;;
+        *)
+            echo "convert_memory_to_mib: unknown unit '$suffix' in '$memory'" >&2
+            echo 0
+            return 1
+            ;;
+    esac
+
+    # awk handles the fractional arithmetic; %d truncates to whole Mi.
+    awk -v n="$number" -v f="$bytes_per_unit" \
+        'BEGIN { printf "%d\n", (n * f) / 1048576 }'
+}
+
+# Convert a Kubernetes CPU quantity to whole millicores.
+#
+# Arguments: $1 - quantity such as 500m, 2, 1.5 or 0.25
+# Outputs:   integer millicores on stdout; 0 when the quantity cannot be parsed
+# Returns:   0 on success, 1 when the quantity is not understood
+convert_cpu_to_millicores() {
+    local cpu=$1
+    local whole fraction
+
+    if [[ $cpu =~ ^([0-9]+)m$ ]]; then
+        # Already millicores; 10# keeps a leading zero from being read as octal.
+        echo $(( 10#${BASH_REMATCH[1]} ))
+    elif [[ $cpu =~ ^[0-9]+$ ]]; then
+        echo $(( 10#$cpu * 1000 ))
+    elif [[ $cpu != "." && $cpu =~ ^([0-9]*)\.([0-9]*)$ ]]; then
+        # Fractional cores: pad/truncate the fraction to exactly three digits
+        # so the result stays in integer arithmetic (1.5 -> 1500, 0.25 -> 250).
+        whole=${BASH_REMATCH[1]:-0}
+        fraction="${BASH_REMATCH[2]}000"
+        echo $(( 10#$whole * 1000 + 10#${fraction:0:3} ))
+    else
+        echo "convert_cpu_to_millicores: cannot parse quantity '$cpu'" >&2
+        echo 0
+        return 1
+    fi
+}
+
+# Report how much of one quota entry is consumed and, at 80% usage or more,
+# queue a JSON recommendation for raising the hard limit.
+#
+# Globals:   NAMESPACE (read)
+#            CRITICAL_INCREASE_LEVEL / WARNING_INCREASE_LEVEL /
+#            INFO_INCREASE_LEVEL (read; default 40 / 25 / 10 percent)
+#            recommendations (appended to; comma-separated JSON objects)
+# Arguments: $1 quota name, $2 resource key, $3 used quantity, $4 hard quantity
+# Outputs:   one status line on stdout
+check_usage() {
+    local quota_name=$1
+    local resource=$2
+    local used=$3
+    local hard=$4
+
+    # Normalize memory to Mi and CPU to millicores so both sides are integers.
+    if [[ $resource == *memory* ]]; then
+        used=$(convert_memory_to_mib "$used")
+        hard=$(convert_memory_to_mib "$hard")
+    elif [[ $resource == *cpu* ]]; then
+        used=$(convert_cpu_to_millicores "$used")
+        hard=$(convert_cpu_to_millicores "$hard")
+    fi
+
+    # Values that still carry a unit suffix (e.g. storage quantities) cannot be
+    # compared as integers; say so instead of silently reporting "OK (0%)".
+    local integer_re='^[0-9]+$'
+    if ! [[ $used =~ $integer_re && $hard =~ $integer_re ]]; then
+        echo "$resource: SKIPPED - cannot compare quantities (used=$used, hard=$hard)"
+        return 0
+    fi
+    # Force base 10 so a leading zero is not interpreted as octal.
+    used=$(( 10#$used ))
+    hard=$(( 10#$hard ))
+
+    local percentage=0
+    if (( hard != 0 )); then
+        percentage=$(( 100 * used / hard ))
+    fi
+
+    # Pick the tier: status line, increase percentage and the value the
+    # increase is applied to (current usage when already over the limit).
+    local base increase_percentage usage severity next_step
+    if (( percentage >= 100 )); then
+        increase_percentage="${CRITICAL_INCREASE_LEVEL:-40}"
+        if (( used > hard )); then
+            echo "$resource: OVER LIMIT ($percentage%) - Adjust resource quota to match current usage with some headroom for $resource in $NAMESPACE"
+            base=$used
+        else
+            echo "$resource: AT LIMIT ($percentage%) - Immediately increase the resource quota for $resource in $NAMESPACE"
+            base=$hard
+        fi
+        usage="at or above 100%"
+        severity=1
+        next_step="Increase the resource quota for $resource in \`$NAMESPACE\`"
+    elif (( percentage >= 90 )); then
+        echo "$resource: WARNING ($percentage%) - Consider increasing the resource quota for $resource in $NAMESPACE"
+        increase_percentage="${WARNING_INCREASE_LEVEL:-25}"
+        base=$hard
+        usage="between 90-99%"
+        severity=2
+        next_step="Consider increasing the resource quota for $resource in \`$NAMESPACE\`"
+    elif (( percentage >= 80 )); then
+        echo "$resource: INFO ($percentage%) - Monitor the resource quota for $resource in $NAMESPACE"
+        increase_percentage="${INFO_INCREASE_LEVEL:-10}"
+        base=$hard
+        usage="between 80-90%"
+        severity=3
+        next_step="Monitor the resource quota for $resource in \`$NAMESPACE\`"
+    else
+        echo "$resource: OK ($percentage%)"
+        return 0
+    fi
+
+    local suggested_value=$(( base + base * increase_percentage / 100 ))
+
+    # Values are passed as printf arguments so a literal % in them is safe.
+    local recommendation
+    printf -v recommendation '{"object_type":"ResourceQuota","object_name":"%s","remediation_type":"resourcequota_update","increase_percentage":"%s","limit_type":"hard","current_value":"%s","suggested_value":"%s","quota_name": "%s", "resource": "%s", "usage": "%s", "severity": "%s", "next_step": "%s"}' \
+        "$quota_name" "$increase_percentage" "$hard" "$suggested_value" \
+        "$quota_name" "$resource" "$usage" "$severity" "$next_step"
+
+    # Append to the comma-separated list consumed at the end of the script.
+    if [ -z "$recommendations" ]; then
+        recommendations="$recommendation"
+    else
+        recommendations="$recommendations, $recommendation"
+    fi
+}
+
+# Fetch every ResourceQuota in the namespace. Stop on failure so a broken
+# kubectl call is not reported as "No recommendations."
+if ! quota_json=$("${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" get quota -n "$NAMESPACE" --context "$CONTEXT" -o json); then
+    echo "Error: unable to fetch resource quotas for namespace $NAMESPACE in context $CONTEXT" >&2
+    exit 1
+fi
+
+# Processing the quota JSON
+echo "Resource Quota and Usage for Namespace: $NAMESPACE in Context: $CONTEXT"
+echo "==========================================="
+
+# One compact JSON document per ResourceQuota object.
+while IFS= read -r item; do
+    quota_name=$(jq -r '.metadata.name' <<<"$item")
+    echo "Quota: $quota_name"
+
+    # jq pairs each hard limit with its used value (default "0") as
+    # tab-separated "key hard used" rows; no temp files or grep lookups needed.
+    while IFS=$'\t' read -r key hard used; do
+        check_usage "$quota_name" "$key" "${used:-0}" "$hard"
+    done < <(jq -r '(.status.used // {}) as $used | (.status.hard // {}) | to_entries[] | [.key, .value, ($used[.key] // "0")] | @tsv' <<<"$item")
+
+    echo "-----------------------------------"
+done < <(jq -c '.items[]?' <<<"$quota_json")
+
+# Outputting recommendations as JSON
+if [ -n "$recommendations" ]; then
+    echo "Recommended Next Steps:"
+    echo "[$recommendations]" | jq .
+else
+    echo "No recommendations."
+fi