From e01d7d416eece4098435b5f2518e3511095fc23d Mon Sep 17 00:00:00 2001 From: Shea Stewart Date: Mon, 30 Oct 2023 10:34:14 -0400 Subject: [PATCH] update (#228) * fixes https://github.com/runwhen/platform-core/issues/1198 * update pdb format * add gcp ingress first task * add backend health check from annotation * updates to gce-ingress cb * template / gen rule updates * tweak genrule * debug template * update name * test rule * fix template * touchups * add function check * debug runtime * failfast * add debug task * test none * platy with os path * add auth to gcloud * remove debug * add timeout * fix typo * debug * test * switch to evaluate * update * update * update * x * x * update * try backtick * x * x * remove code ticks * update timeout_seconds for deployment script * test runbook update * debug * test * test env var * debug * more debug * add defaults * remove invalid dict * revert most changes - focus on simple tasks * fix cb * set dit details * add new rules * try newline edit * try to escape it * remove target service * update issue next steps * update gcloud * update env * gcoud target removal test 2 * target service removals (not all) * update * fix kind * add space * hardcode kind * add readme --- codebundles/cli-test/runbook.robot | 4 - codebundles/cmd-test/runbook.robot | 1 - .../runbook.robot | 9 +- .../runbook.robot | 9 +- .../.runwhen/templates/http-ok-taskset.yaml | 2 + ...blic-loadbalancer-ext-dns-tls-taskset.yaml | 2 + .../templates/http-ok-tls-taskset.yaml | 2 + codebundles/curl-http-ok/runbook.robot | 14 +- .../gcloud-log-inspection/runbook.robot | 5 +- codebundles/gcloud-node-preempt/runbook.robot | 4 +- .../runbook.robot | 5 - .../deployment_logs.sh | 14 +- .../k8s-deployment-healthcheck/runbook.robot | 5 +- .../k8s-ingress-gce-healthcheck | 31 +++ .../k8s-ingress-gce-healthcheck-slx.yaml | 23 +++ .../k8s-ingress-gce-healthcheck-taskset.yaml | 41 ++++ .../k8s-ingress-gce-healthcheck/README.md | 32 +++ .../check_gce_ingress_objects.sh | 71 +++++++ .../k8s-ingress-gce-healthcheck/runbook.robot | 185 ++++++++++++++++++ .../k8s-namespace-healthcheck/runbook.robot | 20 +- 20 files changed, 433 insertions(+), 46 deletions(-) create mode 100644 codebundles/k8s-ingress-gce-healthcheck/.runwhen/generation-rules/k8s-ingress-gce-healthcheck create mode 100644 codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-slx.yaml create mode 100644 codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-taskset.yaml create mode 100644 codebundles/k8s-ingress-gce-healthcheck/README.md create mode 100755 codebundles/k8s-ingress-gce-healthcheck/check_gce_ingress_objects.sh create mode 100644 codebundles/k8s-ingress-gce-healthcheck/runbook.robot diff --git a/codebundles/cli-test/runbook.robot b/codebundles/cli-test/runbook.robot index c5090b71..ab7bbdd0 100644 --- a/codebundles/cli-test/runbook.robot +++ b/codebundles/cli-test/runbook.robot @@ -41,7 +41,6 @@ Run CLI and Parse Output For Issues [Tags] Stdout Test Output Pods ${rsp}= RW.CLI.Run Cli ... cmd=kubectl get pods --context ${CONTEXT} -n ${NAMESPACE} - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} # TODO: remove double slashes and find WYSIWYG method for regex passing @@ -66,7 +65,6 @@ Run CLI and Parse Output For Issues ${rsp}= RW.CLI.Run Cli ... cmd=kubectl get pods --context ${CONTEXT} -n ${NAMESPACE} -ojson - ... target_service=${kubectl} ... env=${env} ... 
secret_file__kubeconfig=${kubeconfig} ${rsp}= RW.CLI.Parse Cli Json Output @@ -89,7 +87,6 @@ Exec Test [Tags] Remote Exec Command Tags Workload Pod ${df}= RW.CLI.Run Cli ... cmd=df - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... run_in_workload_with_name=deploy/crashi @@ -97,7 +94,6 @@ Exec Test ... optional_context=${CONTEXT} ${ls}= RW.CLI.Run Cli ... cmd=ls - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... run_in_workload_with_labels=app=crashi diff --git a/codebundles/cmd-test/runbook.robot b/codebundles/cmd-test/runbook.robot index c430e37f..ed3a9569 100644 --- a/codebundles/cmd-test/runbook.robot +++ b/codebundles/cmd-test/runbook.robot @@ -18,7 +18,6 @@ Run CLI Command [Tags] stdout test output pods ${rsp}= RW.CLI.Run Cli ... cmd=${CLI_COMMAND} - ... target_service=${kubectl} ... env={"KUBECONFIG":"./${kubeconfig.key}"} ... secret_file__kubeconfig=${kubeconfig} RW.Core.Add Pre To Report Command Stdout:\n${rsp.stdout} diff --git a/codebundles/curl-gmp-kong-ingress-inspection/runbook.robot b/codebundles/curl-gmp-kong-ingress-inspection/runbook.robot index b921bab1..8b2d561c 100644 --- a/codebundles/curl-gmp-kong-ingress-inspection/runbook.robot +++ b/codebundles/curl-gmp-kong-ingress-inspection/runbook.robot @@ -10,6 +10,7 @@ Library RW.Core Library RW.CLI Library RW.platform Library RW.NextSteps +Library OperatingSystem Suite Setup Suite Initialization @@ -21,7 +22,6 @@ Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold ${gmp_rsp}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && response=$(curl -s -d "query=rate(kong_http_requests_total{service='${INGRESS_SERVICE}',code=~'${HTTP_ERROR_CODES}'}[${TIME_SLICE}]) > ${HTTP_ERROR_RATE_THRESHOLD}" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/runwhen-nonprod-sandbox/location/global/prometheus/api/v1/query') && echo "$response" | jq -e '.data.result | length > 0' && echo "$response" | jq -r '.data.result[] | "Route:" + .metric.route + " Service:" + .metric.service + " Kong Instance:" + .metric.instance + " HTTP Error Count:" + .value[1]' || echo "No HTTP Error threshold violations found for ${INGRESS_SERVICE}." ... render_in_commandlist=true - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${ingress_name}= RW.CLI.Run Cli @@ -48,7 +48,6 @@ Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold ... _line__raise_issue_if_contains=Route ${gmp_json}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && curl -s -d "query=rate(kong_http_requests_total{service='${INGRESS_SERVICE}',code=~'${HTTP_ERROR_CODES}'}[${TIME_SLICE}])" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/runwhen-nonprod-sandbox/location/global/prometheus/api/v1/query' | jq . - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${history}= RW.CLI.Pop Shell History @@ -62,7 +61,6 @@ Check If Kong Ingress HTTP Request Latency Violates Threshold ${gmp_rsp}= RW.CLI.Run Cli ... 
cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && response=$(curl -s -d "query=histogram_quantile(0.99, sum(rate(kong_request_latency_ms_bucket{service='${INGRESS_SERVICE}'}[${TIME_SLICE}])) by (le)) > ${REQUEST_LATENCY_THRESHOLD}" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/runwhen-nonprod-sandbox/location/global/prometheus/api/v1/query') && echo "$response" | jq -e '.data.result | length > 0' && echo "$response" | jq -r '.data.result[] | "Service: ${INGRESS_SERVICE}" + " HTTP Request Latency(ms):" + .value[1]' || echo "No HTTP request latency threshold violations found for ${INGRESS_SERVICE}." ... render_in_commandlist=true - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${ingress_name}= RW.CLI.Run Cli @@ -99,7 +97,6 @@ Check If Kong Ingress Controller Reports Upstream Errors ${gmp_healthchecks_off_rsp}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && response=$(curl -s -d "query=kong_upstream_target_health{upstream='${INGRESS_UPSTREAM}',state='healthchecks_off'} > 0" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/runwhen-nonprod-sandbox/location/global/prometheus/api/v1/query') && echo "$response" | jq -e '.data.result | length > 0' && echo "$response" | jq -r '.data.result[] | "Service: ${INGRESS_UPSTREAM}" + " Healthchecks Disabled!' || echo "${INGRESS_UPSTREAM} has healthchecks enabled." ... render_in_commandlist=true - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${next_steps}= RW.NextSteps.Suggest @@ -118,7 +115,6 @@ Check If Kong Ingress Controller Reports Upstream Errors ${gmp_healthchecks_rsp}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && response=$(curl -s -d "query=kong_upstream_target_health{upstream='${INGRESS_UPSTREAM}',state=~'dns_error|unhealthy'} > 0" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/runwhen-nonprod-sandbox/location/global/prometheus/api/v1/query') && echo "$response" | jq -e '.data.result | length > 0' && echo "$response" | jq -r '.data.result[] | "Issue detected with Service: ${INGRESS_UPSTREAM}" + " Healthcheck subsystem-state: " + .metric.subsystem + "-" + .metric.state + " Target: " + .metric.target' || echo "${INGRESS_UPSTREAM} is reported as healthy from the Kong ingress controller." ... render_in_commandlist=true - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${next_steps}= RW.NextSteps.Suggest @@ -193,6 +189,7 @@ Suite Initialization ... description=The threshold in ms for request latency to be considered unhealthy. ... pattern=\w* ... example=100 + ${OS_PATH}= Get Environment Variable PATH Set Suite Variable ${GCLOUD_SERVICE} ${GCLOUD_SERVICE} Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json} Set Suite Variable ${GCP_PROJECT_ID} ${GCP_PROJECT_ID} @@ -204,4 +201,4 @@ Suite Initialization Set Suite Variable ${HTTP_ERROR_CODES} ${HTTP_ERROR_CODES} Set Suite Variable ... ${env} - ... {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}"} + ... 
{"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}","PATH":"$PATH:${OS_PATH}"} diff --git a/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot b/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot index 21c50019..c6ba148f 100644 --- a/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot +++ b/codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot @@ -9,6 +9,7 @@ Library BuiltIn Library RW.Core Library RW.CLI Library RW.platform +Library OperatingSystem Suite Setup Suite Initialization @@ -21,22 +22,18 @@ Fetch Nginx Ingress HTTP Errors From GMP And Perform Inspection On Results ${gmp_rsp}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && curl -d "query=rate(nginx_ingress_controller_requests{host='${INGRESS_HOST}', service='${INGRESS_SERVICE}', status=~'${ERROR_CODES}'}[${TIME_SLICE}]) > 0" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/${GCP_PROJECT_ID}/location/global/prometheus/api/v1/query' | jq -r 'if .data.result[0] then "Host:" + .data.result[0].metric.host + " Ingress:" + .data.result[0].metric.ingress + " Namespace:" + .data.result[0].metric.exported_namespace + " Service:" + .data.result[0].metric.service else "" end' ... render_in_commandlist=true - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${gmp_json}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && curl -d "query=rate(nginx_ingress_controller_requests{host='${INGRESS_HOST}', service='${INGRESS_SERVICE}', status=~'${ERROR_CODES}'}[${TIME_SLICE}]) > 0" -H "Authorization: Bearer $(gcloud auth print-access-token)" 'https://monitoring.googleapis.com/v1/projects/${GCP_PROJECT_ID}/location/global/prometheus/api/v1/query' - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ${k8s_ingress_details}= RW.CLI.Run Cli ... 
cmd=namespace="${NAMESPACE}"; context="${CONTEXT}"; ingress="${INGRESS_OBJECT_NAME}"; echo "Ingress: $ingress"; health_status="NA"; services=(); backend_services=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress "$ingress" -n "$namespace" --context "$context" -ojsonpath='{range .spec.rules[*].http.paths[*]}{.backend.service.name}{" "}{.backend.service.port.number}{"\\n"}{end}'); IFS=$'\\n'; for line in $backend_services; do service=$(echo "$line" | cut -d " " -f 1); port=$(echo "$line" | cut -d " " -f 2); if [ -n "$service" ] && [ -n "$port" ]; then echo "Backend Service: $service, Port: $port"; service_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get service "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$service_exists" ]; then health_status="Unhealthy"; echo "Validation: Service $service does not exist"; else endpoint_pods=$(${KUBERNETES_DISTRIBUTION_BINARY} get endpoints "$service" -n "$namespace" --context "$context" -ojsonpath='{range .subsets[*].addresses[*]}- Pod Name: {.targetRef.name}, Pod IP: {.ip}\\n{end}'); if [ -z "$endpoint_pods" ]; then health_status="Unhealthy"; echo "Validation: Endpoint for service $service does not have any pods"; else echo "Endpoint Pod:"; echo -e "$endpoint_pods"; for pod in $endpoint_pods; do if [[ $pod == *"- Pod Name: "* ]]; then pod_name="\${pod#*- Pod Name: }"; pod_name="\${pod_name%%,*}"; if [ -n "$pod_name" ]; then owner_kind=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].kind}'); if [ -n "$owner_kind" ]; then if [ "$owner_kind" = "StatefulSet" ] || [ "$owner_kind" = "DaemonSet" ]; then owner_info="$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}') $owner_kind"; else replicaset=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}'); if [ -n "$replicaset" ]; then owner_kind=$(${KUBERNETES_DISTRIBUTION_BINARY} get replicaset "$replicaset" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].kind}'); owner_name=$(${KUBERNETES_DISTRIBUTION_BINARY} get replicaset "$replicaset" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}'); owner_info="$owner_kind:$owner_name"; fi; fi; fi; if [ -n "$owner_info" ]; then echo "Owner: $owner_info"; fi; fi; fi; done; health_status="Healthy"; fi; fi; services+=("$service"); fi; done; for service in "\${services[@]}"; do service_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get service "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$service_exists" ]; then health_status="Unhealthy"; echo "Validation: Service $service does not exist"; else endpoint_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get endpoints "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$endpoint_exists" ]; then health_status="Unhealthy"; echo "Validation: Endpoint for service $service does not exist"; fi; fi; done; if [ "$health_status" = "Unhealthy" ]; then echo "Health Status: $health_status"; echo "====================="; elif [ "$health_status" = "Healthy" ]; then echo "Health Status: $health_status"; fi; echo "------------" - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ${ingress_owner}= RW.CLI.Run Cli ... 
cmd=echo "${k8s_ingress_details.stdout}" | grep 'Owner:[^ ]*' | awk -F': ' '{print $2}' - ... target_service=${GCLOUD_SERVICE} RW.CLI.Parse Cli Output By Line ... rsp=${gmp_rsp} ... set_severity_level=2 @@ -62,7 +59,6 @@ Find Ingress Owner and Service Health [Tags] owner ingress service endpoints ${k8s_ingress_details}= RW.CLI.Run Cli ... cmd=namespace="${NAMESPACE}"; context="${CONTEXT}"; ingress="${INGRESS_OBJECT_NAME}"; echo "Ingress: $ingress"; health_status="NA"; services=(); backend_services=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress "$ingress" -n "$namespace" --context "$context" -ojsonpath='{range .spec.rules[*].http.paths[*]}{.backend.service.name}{" "}{.backend.service.port.number}{"\\n"}{end}'); IFS=$'\\n'; for line in $backend_services; do service=$(echo "$line" | cut -d " " -f 1); port=$(echo "$line" | cut -d " " -f 2); if [ -n "$service" ] && [ -n "$port" ]; then echo "Backend Service: $service, Port: $port"; service_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get service "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$service_exists" ]; then health_status="Unhealthy"; echo "Validation: Service $service does not exist"; else endpoint_pods=$(${KUBERNETES_DISTRIBUTION_BINARY} get endpoints "$service" -n "$namespace" --context "$context" -ojsonpath='{range .subsets[*].addresses[*]}- Pod Name: {.targetRef.name}, Pod IP: {.ip}\\n{end}'); if [ -z "$endpoint_pods" ]; then health_status="Unhealthy"; echo "Validation: Endpoint for service $service does not have any pods"; else echo "Endpoint Pod:"; echo -e "$endpoint_pods"; for pod in $endpoint_pods; do if [[ $pod == *"- Pod Name: "* ]]; then pod_name="\${pod#*- Pod Name: }"; pod_name="\${pod_name%%,*}"; if [ -n "$pod_name" ]; then owner_kind=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].kind}'); if [ -n "$owner_kind" ]; then if [ "$owner_kind" = "StatefulSet" ] || [ "$owner_kind" = "DaemonSet" ]; then owner_info="$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}') $owner_kind"; else replicaset=$(${KUBERNETES_DISTRIBUTION_BINARY} get pod "$pod_name" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}'); if [ -n "$replicaset" ]; then owner_kind=$(${KUBERNETES_DISTRIBUTION_BINARY} get replicaset "$replicaset" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].kind}'); owner_name=$(${KUBERNETES_DISTRIBUTION_BINARY} get replicaset "$replicaset" -n "$namespace" --context "$context" -o=jsonpath='{.metadata.ownerReferences[0].name}'); owner_info="$owner_name $owner_kind"; fi; fi; fi; if [ -n "$owner_info" ]; then echo "Owner: $owner_info"; fi; fi; fi; done; health_status="Healthy"; fi; fi; services+=("$service"); fi; done; for service in "\${services[@]}"; do service_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get service "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$service_exists" ]; then health_status="Unhealthy"; echo "Validation: Service $service does not exist"; else endpoint_exists=$(${KUBERNETES_DISTRIBUTION_BINARY} get endpoints "$service" -n "$namespace" --context "$context" -ojsonpath='{.metadata.name}'); if [ -z "$endpoint_exists" ]; then health_status="Unhealthy"; echo "Validation: Endpoint for service $service does not exist"; fi; fi; done; if [ "$health_status" = "Unhealthy" ]; then echo "Health 
Status: $health_status"; echo "====================="; elif [ "$health_status" = "Healthy" ]; then echo "Health Status: $health_status"; fi; echo "------------" - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true @@ -147,6 +143,7 @@ Suite Initialization ... pattern=\w* ... example=500 ... default=500|501|502 + ${OS_PATH}= Get Environment Variable PATH Set Suite Variable ${kubeconfig} ${kubeconfig} Set Suite Variable ${kubectl} ${kubectl} Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} @@ -161,4 +158,4 @@ Suite Initialization Set Suite Variable ${INGRESS_OBJECT_NAME} ${INGRESS_OBJECT_NAME} Set Suite Variable ... ${env} - ... {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}", "KUBECONFIG":"./${kubeconfig.key}"} + ... {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}", "KUBECONFIG":"./${kubeconfig.key}","PATH":"$PATH:${OS_PATH}"} diff --git a/codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml b/codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml index 83740d00..40d3ef7c 100644 --- a/codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml +++ b/codebundles/curl-http-ok/.runwhen/templates/http-ok-taskset.yaml @@ -27,5 +27,7 @@ spec: value: '1.2' - name: DESIRED_RESPONSE_CODE value: '200' + - name: OWNER_DETAILS + value: "{'name':'{{match_resource.resource.metadata.name}}', 'kind':'Ingress','namespace':'{{match_resource.resource.metadata.namespace}}'}" secretsProvided: [] servicesProvided: [] \ No newline at end of file diff --git a/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml b/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml index 5586d36f..8f0ec47b 100644 --- a/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml +++ b/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-aks-public-loadbalancer-ext-dns-tls-taskset.yaml @@ -27,5 +27,7 @@ spec: value: "1.2" - name: DESIRED_RESPONSE_CODE value: "200" + - name: OWNER_DETAILS + value: "{'name':'{{match_resource.resource.metadata.name}}', 'kind':'Service','namespace':'{{match_resource.resource.metadata.namespace}}'}" secretsProvided: [] servicesProvided: [] diff --git a/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml b/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml index 0afb9f6d..ac77b6c4 100644 --- a/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml +++ b/codebundles/curl-http-ok/.runwhen/templates/http-ok-tls-taskset.yaml @@ -27,5 +27,7 @@ spec: value: '1.2' - name: DESIRED_RESPONSE_CODE value: '200' + - name: OWNER_DETAILS + value: "{'name':'{{match_resource.resource.metadata.name}}', 'kind':'Ingress','namespace':'{{match_resource.resource.metadata.namespace}}'}" secretsProvided: [] servicesProvided: [] \ No newline at end of file diff --git a/codebundles/curl-http-ok/runbook.robot b/codebundles/curl-http-ok/runbook.robot index 81a250e3..95dd74ca 100644 --- a/codebundles/curl-http-ok/runbook.robot +++ b/codebundles/curl-http-ok/runbook.robot @@ -19,13 +19,18 @@ Checking HTTP URL Is Available And Timely ${curl_rsp}= RW.CLI.Run Cli ... cmd=curl -o /dev/null -w '{"http_code": \%{http_code}, "time_total": \%{time_total}}' -s ${URL} ... 
render_in_commandlist=true + ${owner_details_dict}= Evaluate eval(json.loads($OWNER_DETAILS)) + ${owner_kind}= Set Variable ${owner_details_dict['kind']} + ${owner_name}= Set Variable ${owner_details_dict['name']} + ${owner_namespace}= Set Variable ${owner_details_dict['namespace']} ${http_rsp_code}= RW.CLI.Parse Cli Json Output ... rsp=${curl_rsp} ... extract_path_to_var__http_code=http_code ... set_issue_title=Actual HTTP Response Code Does Not Match Desired HTTP Response Code ... set_severity_level=4 ... http_code__raise_issue_if_neq=${DESIRED_RESPONSE_CODE} - ... set_issue_details=${URL} responded with a status of:$http_code - check service, pods, namespace, virtual machines & load balancers. + ... set_issue_details=${URL} responded with a status of:$http_code \n\n Check related ingress objects, services, and pods. + ... set_issue_next_steps=Check:\n\n `${owner_name}` `${owner_kind}` Health, `${owner_namespace}` Namespace Health ... assign_stdout_from_var=http_code ${http_latency}= RW.CLI.Parse Cli Json Output ... rsp=${curl_rsp} @@ -61,6 +66,13 @@ Suite Initialization ... pattern=\w* ... default=200 ... example=200 + ${OWNER_DETAILS}= RW.Core.Import User Variable OWNER_DETAILS + ... type=string + ... description=Json list of owner details + ... pattern=\w* + ... default="{'name':'my-ingress', 'kind':'Ingress','namespace':'default'}" + ... example="{'name':'my-ingress', 'kind':'Ingress','namespace':'default'}" Set Suite Variable ${DESIRED_RESPONSE_CODE} ${DESIRED_RESPONSE_CODE} Set Suite Variable ${URL} ${URL} Set Suite Variable ${TARGET_LATENCY} ${TARGET_LATENCY} + Set Suite Variable ${OWNER_DETAILS} ${OWNER_DETAILS} diff --git a/codebundles/gcloud-log-inspection/runbook.robot b/codebundles/gcloud-log-inspection/runbook.robot index 22e9f3e2..f1d52c39 100644 --- a/codebundles/gcloud-log-inspection/runbook.robot +++ b/codebundles/gcloud-log-inspection/runbook.robot @@ -6,6 +6,7 @@ Documentation Fetches logs from a GCP using a configurable query and raises Suite Setup Suite Initialization Library RW.Core Library RW.CLI +Library OperatingSystem *** Keywords *** Suite Initialization @@ -38,6 +39,7 @@ Suite Initialization ... description=The GCP Project ID to scope the API to. ... pattern=\w* ... example=myproject-ID + ${OS_PATH}= Get Environment Variable PATH Set Suite Variable ${SEVERITY} ${SEVERITY} Set Suite Variable ${GCLOUD_SERVICE} ${GCLOUD_SERVICE} Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json} @@ -46,7 +48,7 @@ Suite Initialization ${ADD_FILTERS}= Set Variable \ AND ${ADD_FILTERS} END Set Suite Variable ${ADD_FILTERS} ${ADD_FILTERS} - Set Suite Variable ${env} {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}"} + Set Suite Variable ${env} {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}","PATH":"$PATH:${OS_PATH}"} *** Tasks *** Inspect GCP Logs For Common Errors @@ -55,7 +57,6 @@ Inspect GCP Logs For Common Errors ${cmd} Set Variable gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && gcloud logging read "severity>=${SEVERITY}${ADD_FILTERS}" --freshness=120m --limit=50 --format=json ${rsp}= RW.CLI.Run Cli ... cmd=${cmd} - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... 
secret_file__gcp_credentials_json=${gcp_credentials_json} ${namespace_list}= RW.CLI.Parse Cli Json Output diff --git a/codebundles/gcloud-node-preempt/runbook.robot b/codebundles/gcloud-node-preempt/runbook.robot index 1d334fb9..ce761819 100644 --- a/codebundles/gcloud-node-preempt/runbook.robot +++ b/codebundles/gcloud-node-preempt/runbook.robot @@ -19,7 +19,6 @@ List all nodes in an active prempt operation [Tags] stdout gcloud node preempt gcp ${preempt_node_list}= RW.CLI.Run Cli ... cmd=gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && gcloud compute operations list --filter="operationType:( compute.instances.preempted ) AND NOT status:( DONE )" --format=json --project=${GCP_PROJECT_ID} | jq '[.[] | {startTime,targetLink, statusMessage, progress, zone, selfLink}]' - ... target_service=${GCLOUD_SERVICE} ... env=${env} ... secret_file__gcp_credentials_json=${gcp_credentials_json} ... render_in_commandlist=true @@ -55,9 +54,10 @@ Suite Initialization ... description=The GCP Project ID to scope the API to. ... pattern=\w* ... example=myproject-ID + ${OS_PATH}= Get Environment Variable PATH Set Suite Variable ${GCP_PROJECT_ID} ${GCP_PROJECT_ID} Set Suite Variable ${GCLOUD_SERVICE} ${GCLOUD_SERVICE} Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json} Set Suite Variable ... ${env} - ... {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}"} + ... {"CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}","PATH":"$PATH:${OS_PATH}"} diff --git a/codebundles/k8s-argocd-application-health/runbook.robot b/codebundles/k8s-argocd-application-health/runbook.robot index 892f3b83..80fc87e1 100644 --- a/codebundles/k8s-argocd-application-health/runbook.robot +++ b/codebundles/k8s-argocd-application-health/runbook.robot @@ -73,7 +73,6 @@ Fetch ArgoCD Application Sync Status & Health [Tags] Application Sync Health ArgoCD ${app_sync_status}= RW.CLI.Run Cli ... cmd=${binary_name} get applications.argoproj.io ${APPLICATION} -n ${APPLICATION_APP_NAMESPACE} --context ${CONTEXT} -o jsonpath='Application Name: {.metadata.name}, Sync Status: {.status.sync.status}, Health Status: {.status.health.status}, Message: {.status.conditions[].message}' - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true @@ -87,7 +86,6 @@ Fetch ArgoCD Application Last Sync Operation Details [Tags] Application SyncOperation History ArgoCD ${last_sync_status}= RW.CLI.Run Cli ... cmd=${binary_name} get applications.argoproj.io ${APPLICATION} -n ${APPLICATION_APP_NAMESPACE} --context ${CONTEXT} -o json | jq -r '"Application Name: " + .metadata.name + "\\nApplication Namespace: "+ .metadata.namespace + "\\nLast Sync Start Time: " + .status.operationState.finishedAt + "\\nLast Sync Finish Time: " + .status.operationState.startedAt + "\\nLast Sync Status: " + .status.operationState.phase + "\\nLast Sync Message: " + .status.operationState.message' - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true @@ -102,7 +100,6 @@ Fetch Unhealthy ArgoCD Application Resources [Tags] Resources Unhealthy SyncStatus ArgoCD ${unhealthy_resources}= RW.CLI.Run Cli ... 
cmd=${binary_name} get applications.argoproj.io ${APPLICATION} -n ${APPLICATION_APP_NAMESPACE} --context ${CONTEXT} -o json | jq -r '[.status.resources[] | select(.health.status != null) | select(.health.status != "Healthy") | {name,kind,namespace,health}]' - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true @@ -125,7 +122,6 @@ Scan For Errors in Pod Logs Related to ArgoCD Application Deployments [Tags] Error Logs Deployments ArgoCD Pods ${log_errors}= RW.CLI.Run Cli ... cmd=for deployment_name in $(${binary_name} get deployments -l argocd.argoproj.io/instance=${APPLICATION_TARGET_NAMESPACE}_${APPLICATION} -o=custom-columns=NAME:.metadata.name --no-headers -n ${APPLICATION_TARGET_NAMESPACE}); do echo "\\nDEPLOYMENT NAME: $deployment_name \\n" && kubectl logs deployment/$deployment_name --tail=50 -n ${APPLICATION_TARGET_NAMESPACE} | grep -E '${ERROR_PATTERN}'; done - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true @@ -139,7 +135,6 @@ Fully Describe ArgoCD Application [Tags] Application Describe ArgoCD ${application_describe}= RW.CLI.Run Cli ... cmd=${binary_name} describe applications.argoproj.io ${APPLICATION} -n ${APPLICATION_APP_NAMESPACE} --context ${CONTEXT} - ... target_service=${kubectl} ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... render_in_commandlist=true diff --git a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh b/codebundles/k8s-deployment-healthcheck/deployment_logs.sh index 67088c65..be61344f 100755 --- a/codebundles/k8s-deployment-healthcheck/deployment_logs.sh +++ b/codebundles/k8s-deployment-healthcheck/deployment_logs.sh @@ -228,7 +228,7 @@ for match in "${FUZZY_ENV_VAR_RESOURCE_MATCHES[@]}"; do resource=${parts[1]} env_key=${parts[2]} env_value=${parts[3]} - echo "Found string **$string** in resource **$resource**. Check manifest and environment variable **$env_key** for accuracy." + echo "Found string `$string` in resource `$resource`. Check manifest and environment variable `$env_key` for accuracy. " done # Fetch namespace events for searching through @@ -257,7 +257,7 @@ if [[ ${#FUZZY_ENV_VAR_RESOURCE_MATCHES[@]} -ne 0 ]]; then env_value=${parts[3]} if [[ -z ${seen_resources[$resource]} ]]; then - recommendations+=("Review manifest for **$resource** in namespace: **${NAMESPACE}**. Matched error log string **$string** in environment variable **$env_key**.") + recommendations+=("Review manifest for `$resource` in namespace: `${NAMESPACE}`. Matched error log string `$string` in environment variable `$env_key`. 
") seen_resources[$resource]=1 fi done @@ -279,20 +279,20 @@ if [[ -n "$INTERESTING_RESOURCES" ]]; then case "$type" in pod) if [[ "$status" != "Running" ]]; then - recommendations+=("Troubleshoot *failed pods* in *namespace* **${NAMESPACE}**") + recommendations+=("Troubleshoot *failed pods* in *namespace* `${NAMESPACE}` ") fi if ((restarts > 0)); then - recommendations+=("Troubleshoot *container restarts* in *namespace* **${NAMESPACE}**") + recommendations+=("Troubleshoot *container restarts* in *namespace* `${NAMESPACE}` ") fi ;; deployment|deployment.apps) - recommendations+=("Check *deployment* health **$name** in *namespace* **${NAMESPACE}**") + recommendations+=("Check *deployment* health `$name` in *namespace* `${NAMESPACE}` ") ;; service) - recommendations+=("Check *service* health **$name** in *namespace* **${NAMESPACE}**") + recommendations+=("Check *service* health `$name` in *namespace* `${NAMESPACE}` ") ;; statefulset|statefulset.apps) - recommendations+=("Check *statefulset* health **$name** in *namespace* **${NAMESPACE}**") + recommendations+=("Check *statefulset* health `$name` in *namespace* `${NAMESPACE}` ") ;; esac done <<< "$INTERESTING_RESOURCES" diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index c993203d..c5b60d80 100644 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -22,12 +22,13 @@ Check Deployment Log For Issues ... bash_file=deployment_logs.sh ... env=${env} ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 ${recommendations}= RW.CLI.Run Cli - ... cmd=echo "${logs.stdout}" | awk '/Recommended Next Steps:/ {flag=1; next} flag' + ... cmd=echo '''${logs.stdout}''' | awk "/Recommended Next Steps:/ {start=1; getline} start" ... env=${env} ... include_in_history=false ${issues}= RW.CLI.Run Cli - ... cmd=echo "${logs.stdout}" | awk '/Issues Identified:/,/^$/' | tail -n +2 + ... cmd=echo '''${logs.stdout}''' | awk '/Issues Identified:/ {start=1} /The namespace online-boutique has produced the following interesting events:/ {start=0} start' ... env=${env} ... 
include_in_history=false RW.CLI.Parse Cli Output By Line diff --git a/codebundles/k8s-ingress-gce-healthcheck/.runwhen/generation-rules/k8s-ingress-gce-healthcheck b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/generation-rules/k8s-ingress-gce-healthcheck new file mode 100644 index 00000000..e59e4b91 --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/generation-rules/k8s-ingress-gce-healthcheck @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - ingress + matchRules: + - type: and + matches: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + - type: pattern + pattern: "networking.gke.io/ingress-finalizer-V2" + properties: [metadata/finalizers] + mode: substring + - resourceType: variables + type: pattern + pattern: "gcp" + properties: [custom/cloud_provider] + mode: substring + slxs: + - baseName: ingress-gce-health + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-ingress-gce-healthcheck + levelOfDetail: detailed + outputItems: + - type: slx + - type: runbook + templateName: k8s-ingress-gce-healthcheck-taskset.yaml diff --git a/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-slx.yaml b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-slx.yaml new file mode 100644 index 00000000..74890e1b --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-slx.yaml @@ -0,0 +1,23 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/gcp/cloud_load_balancing/cloud_load_balancing.svg + alias: {{match_resource.resource.metadata.name}} GCP HTTP Load Balancer Ingress Health + asMeasuredBy: GCP HTTP Load Balancer and GKE Ingress objects with healthy backends. + configProvided: + - name: OBJECT_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: Ingress objects with HTTP Load Balancers should have healthy backends. 
+ additionalContext: + namespace: "{{match_resource.resource.metadata.namespace}}" + labelMap: "{{match_resource.resource.metadata.labels}}" + cluster: "{{ cluster.name }}" + context: "{{ cluster.context }}" \ No newline at end of file diff --git a/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-taskset.yaml b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-taskset.yaml new file mode 100644 index 00000000..3b84cf0a --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/.runwhen/templates/k8s-ingress-gce-healthcheck-taskset.yaml @@ -0,0 +1,41 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-ingress-gce-healthcheck/runbook.robot + configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.namespace}} + - name: CONTEXT + value: {{context}} + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary}} + - name: GCP_PROJECT_ID + value: {{custom.gcp_project_id}} + - name: INGRESS + value: {{match_resource.resource.metadata.name}} + secretsProvided: + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name}} + - name: gcp_credentials_json + workspaceKey: {{custom.gcp_ops_suite_sa}} + servicesProvided: + - name: {{custom.kubernetes_distribution_binary}} + locationServiceName: {{custom.kubernetes_distribution_binary}}-service.shared \ No newline at end of file diff --git a/codebundles/k8s-ingress-gce-healthcheck/README.md b/codebundles/k8s-ingress-gce-healthcheck/README.md new file mode 100644 index 00000000..9735b31e --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/README.md @@ -0,0 +1,32 @@ +# Kubernetes Ingress-GCE HealthCheck + +Triages the GCP HTTP Load Balancer resources that are created when an ingress object is detected and created by the ingress-gce controller. + +## Tasks +- `Search For GCE Ingress Warnings in GKE`- Executes CLI commands to find warning events related to GCE Ingress and services objects. Parses the CLI output to identify and report issues. + +- `Identify Unhealthy GCE HTTP Ingress Backends` - Uses CLI commands to check the backend annotations on the Ingress object for health issues. Parses the CLI output to identify and report unhealthy backends. + +- `Validate GCP HTTP Load Balancer Configurations` Executes bash scripts to validate GCP HTTP Load Balancer components extracted from Ingress annotations. Parses the output for issues and recommendations. + +- `Fetch Network Error Logs from GCP Operations Manager for Ingress Backends` - Executes CLI commands to fetch network error logs for Ingress backends. Parses the CLI output to identify and report network error issues. + +- `Review GCP Operations Logging Dashboard`: Generates URLs to access GCP Operations Logging Dashboard for Load Balancer logs and backend logs. + +## Configuration + +The TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set: + +- `kubeconfig`: The kubeconfig secret containing access info for the cluster. 
+- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`. +- `CONTEXT`: The Kubernetes context to operate within. +- `NAMESPACE`: The name of the namespace to search. +- `INGRESS`: The name of the ingress object to triage. +- `GCP_PROJECT_ID`: The id of the gcp project to query. +- `gcp_credentials_json`: The name of the secret that contains GCP service account json details with project `Viewer` access. + + +## TODO +- [ ] Add documentation +- [ ] Add github integration with source code vs image comparison +- [ ] Find applicable raise issue use \ No newline at end of file diff --git a/codebundles/k8s-ingress-gce-healthcheck/check_gce_ingress_objects.sh b/codebundles/k8s-ingress-gce-healthcheck/check_gce_ingress_objects.sh new file mode 100755 index 00000000..be7caef0 --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/check_gce_ingress_objects.sh @@ -0,0 +1,71 @@ +#!/bin/bash +set -eo pipefail + + +# Check if a command exists +function check_command_exists() { + if ! command -v $1 &> /dev/null; then + echo "$1 could not be found" + exit + fi +} + +# ------------------------- Dependency Verification --------------------------- + +# Ensure all the required binaries are accessible +check_command_exists ${KUBERNETES_DISTRIBUTION_BINARY} +check_command_exists gcloud + +# Auth to gcloud +gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS + +# Extract the necessary annotations from the Ingress +FORWARDING_RULE=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/forwarding-rule}') +URL_MAP=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/url-map}') +TARGET_PROXY=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/target-proxy}') +BACKENDS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/backends}') +BACKENDS=( $(echo $BACKENDS_JSON | jq -r 'keys[]') ) # Assuming jq is installed for JSON parsing + +recommendations=() + +# Verify Forwarding Rule +echo "--- Verifying Forwarding Rule $FORWARDING_RULE ---" +if ! gcloud compute forwarding-rules describe $FORWARDING_RULE --global --project=$GCP_PROJECT_ID &>/dev/null; then + recommendations+=("Warning: Forwarding Rule [$FORWARDING_RULE] doesn't exist! Verify the correctness of the Ingress configuration and ensure the forwarding rule is properly created.") +fi + +# Verify URL Map +echo "--- Verifying URL Map $URL_MAP ---" +if ! gcloud compute url-maps describe $URL_MAP --global --project=$GCP_PROJECT_ID &>/dev/null; then + recommendations+=("Warning: URL Map [$URL_MAP] doesn't exist! Check the associated ingress controller's logs and the GCP logs for any errors relating to the URL map creation.") +fi + +# Verify Target Proxy (both HTTP and HTTPS) +echo "--- Verifying Target Proxy $TARGET_PROXY ---" +if ! gcloud compute target-https-proxies describe $TARGET_PROXY --global --project=$GCP_PROJECT_ID &>/dev/null && ! gcloud compute target-http-proxies describe $TARGET_PROXY --global --project=$GCP_PROJECT_ID &>/dev/null; then + recommendations+=("Warning: Target Proxy [$TARGET_PROXY] doesn't exist! 
Ensure the Ingress is correctly set up to create the required target proxy.") +fi + +# Display Backend Service's health status and check for problematic backends +echo "--- Backend Service Health Status ---" +for backend in "${BACKENDS[@]}"; do + health_status=$(gcloud compute backend-services get-health $backend --global --project=$GCP_PROJECT_ID) + echo "Backend Service: $backend" + echo "$health_status" + echo "-----------------------------" + + if [[ ! $health_status =~ "HEALTHY" ]] || [[ $health_status =~ "UNHEALTHY" ]]; then + recommendations+=("Warning: Backend Service [$backend] has problematic health status. Check health checks and firewall rules for this backend. View GCP Logs. Verify IPs are on routable subnets (container-native load balancing) or using NodePort.") + fi +done + +# Display aggregated recommendations +if [[ ${#recommendations[@]} -ne 0 ]]; then + echo "Recommendations:" + for recommendation in "${recommendations[@]}"; do + echo "- $recommendation" + done + +else + echo "All resources associated with the ingress appear healthy." +fi \ No newline at end of file diff --git a/codebundles/k8s-ingress-gce-healthcheck/runbook.robot b/codebundles/k8s-ingress-gce-healthcheck/runbook.robot new file mode 100644 index 00000000..1538ec36 --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/runbook.robot @@ -0,0 +1,185 @@ +*** Settings *** +Documentation Troubleshoot GCE Ingress Resources related to GCP HTTP Load Balancer in GKE +Metadata Author stewartshea +Metadata Display Name Kubernetes Ingress GCE & GCP HTTP Load Balancer Healthcheck +Metadata Supports Kubernetes,GKE,GCE,GCP + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library OperatingSystem + +Suite Setup Suite Initialization + + +*** Tasks *** +Search For GCE Ingress Warnings in GKE + [Documentation] Find warning events related to GCE Ingress and services objects + [Tags] service ingress endpoint health ingress-gce gke + ${event_warnings}= RW.CLI.Run Cli + ... cmd=INGRESS_NAME=${INGRESS}; NAMESPACE=${NAMESPACE}; CONTEXT=${CONTEXT}; ${KUBERNETES_DISTRIBUTION_BINARY} get events -n $NAMESPACE --context $CONTEXT --field-selector involvedObject.kind=Ingress,involvedObject.name=$INGRESS_NAME,type!=Normal; for SERVICE_NAME in $(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS_NAME -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.spec.rules[*].http.paths[*].backend.service.name}'); do ${KUBERNETES_DISTRIBUTION_BINARY} get events -n $NAMESPACE --context $CONTEXT --field-selector involvedObject.kind=Service,involvedObject.name=$SERVICE_NAME,type!=Normal; done + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... render_in_commandlist=true + + RW.CLI.Parse Cli Output By Line + ... rsp=${event_warnings} + ... set_severity_level=3 + ... set_issue_expected=GCE ingress and services should not have warnings in namespace `${NAMESPACE}` for ingress `${INGRESS}` + ... set_issue_actual=Ingress and service objects have warnings in namespace `${NAMESPACE}` for ingress `${INGRESS}` + ... set_issue_title=Unhealthy GCE ingress or service objects found in namespace `${NAMESPACE}` for ingress `${INGRESS}` + ... set_issue_details=The following warning events were found:\n\n${event_warnings.stdout}\n\n + ... set_issue_next_steps=Validate GCP HTTP Load Balancer Configurations for ${INGRESS} + ... 
_line__raise_issue_if_contains=Warning + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report GCE Ingress warnings for ${NAMESPACE}:\n\n${event_warnings.stdout} + RW.Core.Add Pre To Report Commands Used: ${history} + +Identify Unhealthy GCE HTTP Ingress Backends + [Documentation] Checks the backend annotations on the ingress object to determine if they are not registered as healthy + [Tags] service ingress endpoint health ingress-gce gke + ${unhealthy_backends}= RW.CLI.Run Cli + ... cmd=INGRESS_NAME=${INGRESS}; NAMESPACE=${NAMESPACE}; CONTEXT=${CONTEXT}; ${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS_NAME -n $NAMESPACE --context $CONTEXT -o=json | jq -r '.metadata.annotations["ingress.kubernetes.io/backends"] | fromjson | to_entries[] | select(.value != "HEALTHY") | "Backend: " + .key + " Status: " + .value' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... render_in_commandlist=true + + RW.CLI.Parse Cli Output By Line + ... rsp=${unhealthy_backends} + ... set_severity_level=2 + ... set_issue_expected=GCE HTTP Load Balancer should have all backends in a HEALTHY state for ingress `${INGRESS}` + ... set_issue_actual=GCE HTTP Load Balancer has unhealthy backends for ingress `${INGRESS}` + ... set_issue_title=GCE HTTP Load Balancer has unhealthy backends for ingress `${INGRESS}` + ... set_issue_details=The following GCP HTTP Load Balancer backends are not healthy:\n\n${unhealthy_backends.stdout}\n\n + ... set_issue_next_steps=Fetch Network Error Logs from GCP Operations Manager for Ingress Backends:\n\n${unhealthy_backends.stdout}\n\n + ... _line__raise_issue_if_contains=Backend + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report + ... GCE unhealthy backends in `${NAMESPACE}` for ingress `${INGRESS}`:\n\n${unhealthy_backends.stdout} + RW.Core.Add Pre To Report Commands Used: ${history} + +Validate GCP HTTP Load Balancer Configurations + [Documentation] Extract GCP HTTP Load Balancer components from ingress annotations and check health of each object + [Tags] service ingress endpoint health backends urlmap gce + ${gce_config_objects}= RW.CLI.Run Bash File + ... bash_file=check_gce_ingress_objects.sh + ... secret_file__kubeconfig=${kubeconfig} + ... secret_file__gcp_credentials_json=${gcp_credentials_json} + ... env=${env} + ... include_in_history=false + ... timeout_seconds=120 + + ${recommendations}= RW.CLI.Run Cli + ... cmd=echo '''${gce_config_objects.stdout}''' | awk "/Recommendations:/ {start=1; getline} start" + ... env=${env} + ... include_in_history=false + + RW.CLI.Parse Cli Output By Line + ... rsp=${recommendations} + ... set_severity_level=3 + ... set_issue_expected=GCP HTTP Load Balancer objects should exist in a healthy state for ingress: `${INGRESS}` + ... set_issue_actual=GCP HTTP Load Balancer objects are unhealthy, unknown, or missing for ingress: `${INGRESS}` + ... set_issue_title=Unhealthy or missing GCP HTTP Load Balancer configurations found for ingress `${INGRESS}` + ... set_issue_details=The following report is related to all GCP HTTP Load Balancer objects:\n\n${gce_config_objects.stdout}\n\n + ... set_issue_next_steps=${recommendations.stdout} + ... 
_line__raise_issue_if_contains=- + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report Ingress object summary for ingress: `${INGRESS}` in namespace: `${NAMESPACE}`:\n\n${gce_config_objects.stdout} + + +Fetch Network Error Logs from GCP Operations Manager for Ingress Backends + [Documentation] Fetch error logs from the last day that are specific to the HTTP Load Balancer backends + [Tags] service ingress endpoint health + ${network_error_logs}= RW.CLI.Run Cli + ... cmd=INGRESS_NAME=${INGRESS}; NAMESPACE=${NAMESPACE}; CONTEXT=${CONTEXT}; GCP_PROJECT_ID=${GCP_PROJECT_ID};for backend in $(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS_NAME -n $NAMESPACE --context $CONTEXT -o=json | jq -r '.metadata.annotations["ingress.kubernetes.io/backends"] | fromjson | to_entries[] | select(.value != "HEALTHY") | .key'); do echo "Backend: \${backend}" && gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS && gcloud logging read 'severity="ERROR" AND resource.type="gce_network" AND protoPayload.resourceName=~"'\${backend}'"' --freshness=1d --limit=50 --project "$GCP_PROJECT_ID" --format=json | jq '[.[] | {timestamp: .timestamp, ip: .protoPayload.request.networkEndpoints[].ipAddress, message: .protoPayload.response.error.message}] | group_by(.message) | map(max_by(.timestamp)) | .[] | (.timestamp + " | IP: " + .ip + " | Error: " + .message)'; done + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... secret_file__gcp_credentials_json=${gcp_credentials_json} + ... render_in_commandlist=true + RW.CLI.Parse Cli Output By Line + ... rsp=${network_error_logs} + ... set_severity_level=2 + ... set_issue_expected=No network error logs should be found related to Ingress `${INGRESS}` + ... set_issue_actual=Network error logs are found in GCP Operations Console related to Ingress `${INGRESS}` + ... set_issue_title=Network error logs are found for Ingress `${INGRESS}` + ... set_issue_details=Network error logs were found:\n\n${network_error_logs.stdout}\n\n + ... set_issue_next_steps=Review logs and check GCP documentation to help verify configuration validity. + ... set_issue_reproduce_hint=Check the ingress object for related annotations. Inspect those objects in the GCP Console. + ... _line__raise_issue_if_contains= + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report Network error logs possibly related to Ingress ${INGRESS}:\n\n${network_error_logs.stdout} + RW.Core.Add Pre To Report Commands Used: ${history} + +Review GCP Operations Logging Dashboard + [Documentation] Create URLs that will help users obtain logs from the GCP Dashboard + [Tags] service ingress endpoint health logging http loadbalancer + ${loadbalancer_log_url}= RW.CLI.Run CLI + ... 
cmd=INGRESS=${INGRESS}; NAMESPACE=${NAMESPACE}; CONTEXT=${CONTEXT}; FORWARDING_RULE=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\\.kubernetes\\.io/forwarding-rule}') && URL_MAP=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\\.kubernetes\\.io/url-map}') && TARGET_PROXY=$(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=jsonpath='{.metadata.annotations.ingress\\.kubernetes\\.io/target-proxy}') && LOG_QUERY="resource.type=\\"http_load_balancer\\" AND resource.labels.forwarding_rule_name=\\"$FORWARDING_RULE\\" AND resource.labels.target_proxy_name=\\"$TARGET_PROXY\\" AND resource.labels.url_map_name=\\"$URL_MAP\\"" && ENCODED_LOG_QUERY=$(echo $LOG_QUERY | sed -e 's| |%20|g' -e 's|"|%22|g' -e 's|(|%28|g' -e 's|)|%29|g' -e 's|=|%3D|g' -e 's|/|%2F|g') && GCP_LOGS_URL="https://console.cloud.google.com/logs/query;query=$ENCODED_LOG_QUERY?project=$GCP_PROJECT_ID" && echo $GCP_LOGS_URL + ... secret_file__kubeconfig=${kubeconfig} + ... secret_file__gcp_credentials_json=${gcp_credentials_json} + ... env=${env} + ... render_in_commandlist=true + ${backend_log_url}= RW.CLI.Run Cli + ... cmd=INGRESS=${INGRESS}; NAMESPACE=${NAMESPACE}; CONTEXT=${CONTEXT}; QUERY="resource.type=\\"gce_network\\"" && for backend in $(${KUBERNETES_DISTRIBUTION_BINARY} get ingress $INGRESS -n $NAMESPACE --context $CONTEXT -o=json | jq -r '.metadata.annotations["ingress.kubernetes.io/backends"] | fromjson | to_entries[] | select(.value != "HEALTHY") | .key'); do QUERY="$QUERY AND protoPayload.resourceName=~\\"$backend\\""; done && ENCODED_QUERY=$(echo $QUERY | jq -sRr @uri) && DASHBOARD_URL="https://console.cloud.google.com/logs/query;query=$ENCODED_QUERY?project=$GCP_PROJECT_ID" && echo $DASHBOARD_URL + ... secret_file__kubeconfig=${kubeconfig} + ... env=${env} + ... render_in_commandlist=true + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report GCP Ops Logs for HTTP Load Balancer ${INGRESS}:\n\n${loadbalancer_log_url.stdout} + RW.Core.Add Pre To Report GCP Ops Logs for ${INGRESS} backends:\n\n${backend_log_url.stdout} + RW.Core.Add Pre To Report Commands Used: ${history} + + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). + ... pattern=\w* + ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=The name of the Kubernetes namespace to scope actions and searching to. + ... pattern=\w* + ... example=my-namespace + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Which Kubernetes context to operate within. + ... pattern=\w* + ... example=my-main-cluster + ${INGRESS}= RW.Core.Import User Variable INGRESS + ... type=string + ... description=Which Ingress object to troubleshoot. + ... pattern=\w* + ... example=my-ingress + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Which binary to use for Kubernetes CLI commands. + ... enum=[kubectl,oc] + ... example=kubectl + ... 
default=kubectl + ${gcp_credentials_json}= RW.Core.Import Secret gcp_credentials_json + ... type=string + ... description=GCP service account json used to authenticate with GCP APIs. + ... pattern=\w* + ... example={"type": "service_account","project_id":"myproject-ID", ... super secret stuff ...} + ${GCP_PROJECT_ID}= RW.Core.Import User Variable GCP_PROJECT_ID + ... type=string + ... description=The GCP Project ID to scope the API to. + ... pattern=\w* + ... example=myproject-ID + ${OS_PATH}= Get Environment Variable PATH + + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${INGRESS} ${INGRESS} + Set Suite Variable ${gcp_credentials_json} ${gcp_credentials_json} + Set Suite Variable ${GCP_PROJECT_ID} ${GCP_PROJECT_ID} + Set Suite Variable + ... ${env} + ... {"KUBECONFIG":"./${kubeconfig.key}", "GCP_PROJECT_ID":"${GCP_PROJECT_ID}","CLOUDSDK_CORE_PROJECT":"${GCP_PROJECT_ID}","GOOGLE_APPLICATION_CREDENTIALS":"./${gcp_credentials_json.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","INGRESS":"${INGRESS}", "PATH":"$PATH:${OS_PATH}"} diff --git a/codebundles/k8s-namespace-healthcheck/runbook.robot b/codebundles/k8s-namespace-healthcheck/runbook.robot index e154537f..354db78d 100644 --- a/codebundles/k8s-namespace-healthcheck/runbook.robot +++ b/codebundles/k8s-namespace-healthcheck/runbook.robot @@ -349,20 +349,20 @@ Check Missing or Risky PodDisruptionBudget Policies RW.CLI.Parse Cli Output By Line ... rsp=${pdb_check} ... set_severity_level=2 - ... set_issue_expected=PodDisruptionBudgets in ${NAMESPACE} should not block regular maintenance - ... set_issue_actual=We detected PodDisruptionBudgets in namespace ${NAMESPACE} which are considered Risky to maintenance operations - ... set_issue_title=Risky PodDisruptionBudgets Found in namespace ${NAMESPACE} - ... set_issue_details=Review the PodDisruptionBudget check for ${NAMESPACE}:\n$_stdout - ... set_issue_next_steps=Review & Edit PodDisruptionBudget for ${risky_pdbs.stdout} + ... set_issue_expected=PodDisruptionBudgets in `${NAMESPACE}` should not block regular maintenance + ... set_issue_actual=We detected PodDisruptionBudgets in namespace `${NAMESPACE}` which are considered Risky to maintenance operations + ... set_issue_title=Risky PodDisruptionBudgets Found in namespace `${NAMESPACE}` + ... set_issue_details=Review the PodDisruptionBudget check for `${NAMESPACE}`:$_stdout + ... set_issue_next_steps=Review & Edit PodDisruptionBudget for `${risky_pdbs.stdout}` ... _line__raise_issue_if_contains=(.*?) RW.CLI.Parse Cli Output By Line ... rsp=${pdb_check} ... set_severity_level=4 - ... set_issue_expected=PodDisruptionBudgets in ${NAMESPACE} should exist for applications that have more than 1 replica - ... set_issue_actual=We detected Deployments or StatefulSets in namespace ${NAMESPACE} which are missing PodDisruptionBudgets - ... set_issue_title=Deployments or StatefulSets in namespace ${NAMESPACE} are missing PodDisruptionBudgets - ... set_issue_details=Review the Deployments and StatefulSets missing PodDisruptionBudget in ${NAMESPACE}:\n$_stdout - ... set_issue_next_steps=Create missing Pod Distruption Budgets for ${missing_pdbs.stdout} + ... set_issue_expected=PodDisruptionBudgets in `${NAMESPACE}` should exist for applications that have more than 1 replica + ... 
set_issue_actual=We detected Deployments or StatefulSets in namespace `${NAMESPACE}` which are missing PodDisruptionBudgets + ... set_issue_title=Deployments or StatefulSets in namespace `${NAMESPACE}` are missing PodDisruptionBudgets + ... set_issue_details=Review the Deployments and StatefulSets missing PodDisruptionBudget in `${NAMESPACE}`:\n$_stdout + ... set_issue_next_steps=Create missing Pod Disruption Budgets for `${missing_pdbs.stdout}` ... _line__raise_issue_if_contains=Missing ${history}= RW.CLI.Pop Shell History RW.Core.Add To Report ${pdb_check.stdout}\n
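
For reference, a minimal standalone sketch of the backend health check that the new k8s-ingress-gce-healthcheck tasks perform, assuming kubectl and jq are available on the PATH; the ingress, namespace, and context values below are placeholders, not values from this patch:

#!/bin/bash
# Sketch: list GCE ingress backends that the ingress-gce controller has not marked HEALTHY.
# INGRESS, NAMESPACE, and CONTEXT are placeholder values for illustration only.
set -euo pipefail

INGRESS="my-ingress"
NAMESPACE="my-namespace"
CONTEXT="my-main-cluster"

# The ingress-gce controller records backend health in the
# ingress.kubernetes.io/backends annotation as a JSON map of
# backend service name -> state (e.g. "HEALTHY", "UNHEALTHY", "Unknown").
kubectl get ingress "$INGRESS" -n "$NAMESPACE" --context "$CONTEXT" -o json \
  | jq -r '.metadata.annotations["ingress.kubernetes.io/backends"]
      | fromjson
      | to_entries[]
      | select(.value != "HEALTHY")
      | "Backend: \(.key) Status: \(.value)"'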
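
Both the k8s-deployment-healthcheck change and the new GCE healthcheck runbook split script output on a header line with awk before raising issues. A small sketch of that pattern follows; the sample output text is made up for illustration, and single quotes suffice here (the runbooks use double quotes around the awk program, likely because the command is embedded inside a Robot Framework string that already uses triple single quotes):

#!/bin/bash
# Sketch: extract only the lines that follow a "Recommendations:" header,
# mirroring the awk expression used in the runbooks above.
set -euo pipefail

# Hypothetical script output used only for illustration.
output=$'All resources associated with the ingress appear healthy.\nRecommendations:\n- Check health checks and firewall rules\n- Review GCP logs'

# On the header line, set the flag and read the next line; every line from
# there on is printed, so only the recommendation lines reach stdout.
echo "$output" | awk '/Recommendations:/ {start=1; getline} start'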