From 88c4ea935b4d8b3672c86f931ac92c291e6e4919 Mon Sep 17 00:00:00 2001
From: Jacob Salmela
Date: Thu, 9 Jan 2025 19:08:34 -0600
Subject: [PATCH] CASMTRIAGE-7657 adjust k8s_verify_cluster_2 test to avoid some false (#659)

---
 .../scripts/verify_kube_system_pods.sh        | 115 ++++++++++++++++++
 .../tests/ncn/goss-k8s-verify-cluster.yaml    |  18 ++-
 2 files changed, 123 insertions(+), 10 deletions(-)
 create mode 100755 goss-testing/scripts/verify_kube_system_pods.sh

diff --git a/goss-testing/scripts/verify_kube_system_pods.sh b/goss-testing/scripts/verify_kube_system_pods.sh
new file mode 100755
index 00000000..4182e6ba
--- /dev/null
+++ b/goss-testing/scripts/verify_kube_system_pods.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# MIT License
+#
+# (C) Copyright 2022-2025 Hewlett Packard Enterprise Development LP
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# Report kube-system pods that are neither Running nor Completed.
+#
+# A pod in a bad state is forgiven when a different pod carrying the same
+# app.kubernetes.io/name label started more recently and has not failed.
+# Every pod that is not forgiven is named on stderr. The script exits 0 once
+# all pods have been examined; the Goss test that runs it expects exit
+# status 0 and no stdout/stderr output.
+#
+# Environment:
+#   DEBUG  "true" traces each decision on stdout (default: false)
+#
+set -euo pipefail
+
+DEBUG=${DEBUG:-false}
+
+# Print the arguments only when DEBUG tracing is enabled.
+debug() {
+  if [[ "${DEBUG}" == "true" ]]; then
+    echo "$*"
+  fi
+}
+
+# get an array of all non-running pods in the kube-system namespace
+# etcdbackup job pods may sit in the normal lifecycle states of the second
+# filter. That pattern starts with '-', so it is passed via -e; otherwise grep
+# would parse it as a cluster of options.
+non_running_pods=()
+mapfile -t non_running_pods < <(
+  kubectl get po -n kube-system --no-headers \
+    | awk '{ print $1" "$3 }' \
+    | grep -Ev ' (Running|Completed)$' \
+    | grep -Ev -e '-etcdbackup-.* (ContainerCreating|Init:[0-9]+/[0-9]+|NotReady|Pending|PodInitializing|Terminating)$' \
+    | awk '{ print $1 }'
+)
+
+# Nothing to inspect. Exiting here also avoids expanding an empty array under
+# `set -u`, which bash releases before 4.4 reject.
+if (( ${#non_running_pods[@]} == 0 )); then
+  exit 0
+fi
+
+for pod_name in "${non_running_pods[@]}"; do
+  debug "checking non-running pod: $pod_name"
+  # get the label name of the current pod
+  label_name=$(kubectl get -n kube-system po "$pod_name" \
+    -o jsonpath='{.metadata.labels.app\.kubernetes\.io/name}')
+  debug "checking for other pods with the label: $label_name"
+
+  # Track the newest non-failed pod sharing the label. Both values are reset
+  # for every pod so results for one label never leak into the next.
+  most_recent_pod=""
+  most_recent_time=0
+  # Without the label there is nothing to compare against.
+  if [[ -n "$label_name" ]]; then
+    # One query returns name, phase and start time of every sibling pod.
+    # A pod that has not been scheduled yet has no startTime; "-" stands in
+    # so every tab-separated record keeps three non-empty fields.
+    while IFS=$'\t' read -r same status start_time; do
+      debug " found: $same $status"
+      # .status.phase reports a failed pod as "Failed"; "Error" is kept for
+      # compatibility with the status name shown by `kubectl get`.
+      case "$status" in
+        Failed | Error) continue ;;
+      esac
+      if [[ "$start_time" == "-" ]]; then
+        continue
+      fi
+      # Calculate the start time in seconds since epoch
+      start_time_seconds=$(date -u -d "$start_time" +%s)
+      if (( start_time_seconds > most_recent_time )); then
+        debug " $same start_time is greater than most_recent_time: $start_time_seconds > $most_recent_time"
+        most_recent_time=$start_time_seconds
+        most_recent_pod=$same
+      fi
+    done < <(
+      kubectl get pods -n kube-system -l "app.kubernetes.io/name=${label_name}" -o json \
+        | jq -r '.items[] | [.metadata.name, (.status.phase // "-"), (.status.startTime // "-")] | @tsv'
+    )
+  fi
+
+  # A newer pod of the same application that is not in a failed state means
+  # this pod was superseded; treating that as success prevents false positives.
+  if [[ -n "$most_recent_pod" && "$most_recent_pod" != "$pod_name" ]]; then
+    debug " a more recent pod is not in a fail state: $pod_name"
+    continue
+  fi
+  # print the pod that is in a poor state to fail the test
+  echo "$pod_name is not running or completed" >&2
+done
+
+exit 0
diff --git a/goss-testing/tests/ncn/goss-k8s-verify-cluster.yaml b/goss-testing/tests/ncn/goss-k8s-verify-cluster.yaml
index fbc9878f..6bda5226 100644
--- a/goss-testing/tests/ncn/goss-k8s-verify-cluster.yaml
+++ b/goss-testing/tests/ncn/goss-k8s-verify-cluster.yaml
@@ -1,7 +1,7 @@
 #
 # MIT License
 #
-# (C) Copyright 2014-2022, 2024 Hewlett Packard Enterprise Development LP
+# (C) Copyright 2014-2025 Hewlett Packard Enterprise Development LP
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -21,9 +21,10 @@
 # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 # OTHER DEALINGS IN THE SOFTWARE.
 #
-
+
 {{ $kubectl := .Vars.kubectl }}
 {{ $logrun := .Env.GOSS_BASE | printf "%s/scripts/log_run.sh" }}
+{{ $verify_kube_system_pods := .Env.GOSS_BASE | printf "%s/scripts/verify_kube_system_pods.sh" }}
 command:
     {{ $testlabel_1 := "k8s_verify_cluster_1" }}
     {{$testlabel_1}}:
@@ -56,16 +57,13 @@ command:
         # Look for pods that are not Running or Completed
         # However, we do allow etcdbackup job pods to be in other "normal" lifecycle states
         exec: |-
-            "{{$logrun}}" -l "{{$testlabel_2}}" \
-                "{{$kubectl}}" get po -n kube-system --no-headers \
-                | awk '{ print $1" "$3 }' \
-                | grep -Ev ' (Running|Completed)$' \
-                | grep -Ev '-etcdbackup-.* (ContainerCreating|Init:[0-9]+/[0-9]+|NotReady|Pending|PodInitializing|Terminating)$'
-        # We expect no output and for the grep command to return non-0
+            "{{$logrun}}" -l "{{$testlabel_2}}" "{{$verify_kube_system_pods}}"
+        # We expect no output when all pods are in a good state
+        # or if a failed pod has a newer one that succeeded
         stdout:
             - "!/./"
         stderr:
             - "!/./"
-        exit-status:
-            gt: 0
+        # the script should return 0 as it checks all pods and the errors will be checked via the stdout/stderr
+        exit-status: 0
         timeout: 20000