From 7e90960ed0bd1e0546b8ceaab9cee3ab50e29b60 Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:37:46 -0500 Subject: [PATCH] test(scale): [NPM] fix flakes in kwok and capture kernel state on failure (#2249) * test(kwok): try standard tier for cluster * Revert "test(kwok): try standard tier for cluster" This reverts commit f76e50a559a214f05fa909a41f3a54fd91303f51. * test: run kwok as pod * fix: add execute permission to sh files * fix: allow scheduling on linux for kwok pod * fix: wait timeouts and add retry logic * fix: make sure to reapply kwok nodes if wait fails * test: print out cluster state if wait fails * test: prevent kwok from scheduling on windows node * test: first wait for kwok pods (20 minutes) * style: rearrange wait check * fix: scale up kwok controller for reliability * fix: typo in scaling kwok pods * fix: check kwok pods running in test-connectivity instead of test-scale * fix: wait for pods before adding NetPol * fix: 7 second timeout for windows agnhost connect * feat: get cluster state on failure * debug: fake a failure to verify log capture * fix: bugs in getting cluster state * fix: remove newline instead of "n" * Revert "debug: fake a failure to verify log capture" This reverts commit 24ec92742547fc9feec5113dc5c86cf270e2922c. * feat(win-debug): get prom metrics * fix: leave timeout=5s for win * style: remove new, unused --connect-timeout parameter * style: comment * feat: top node/pod --- .pipelines/npm/npm-scale-test.yaml | 128 ++++++++++--------- debug/windows/npm/win-debug.sh | 54 ++++++-- test/scale/connectivity/test-connectivity.sh | 2 +- test/scale/run-kwok-as-pod.sh | 5 + test/scale/test-scale.sh | 75 ++++++++--- 5 files changed, 172 insertions(+), 92 deletions(-) create mode 100644 test/scale/run-kwok-as-pod.sh diff --git a/.pipelines/npm/npm-scale-test.yaml b/.pipelines/npm/npm-scale-test.yaml index c8a418053a..866b887d18 100644 --- a/.pipelines/npm/npm-scale-test.yaml +++ b/.pipelines/npm/npm-scale-test.yaml @@ -96,7 +96,7 @@ jobs: displayName: "Verify Directory Exists" failOnStderr: true - task: AzureCLI@2 - displayName: "Download Kubectl and Kwok" + displayName: "Download Kubectl" inputs: azureSubscription: $(BUILD_VALIDATIONS_SERVICE_CONNECTION) scriptType: "bash" @@ -107,11 +107,6 @@ jobs: set -e curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl chmod +x kubectl - - KWOK_REPO=kubernetes-sigs/kwok - KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name') - wget -O kwok -c "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_LATEST_RELEASE}/kwok-$(go env GOOS)-$(go env GOARCH)" - chmod +x kwok - task: AzureCLI@2 displayName: "Create AKS Cluster" inputs: @@ -140,17 +135,22 @@ jobs: --vm-set-type VirtualMachineScaleSets \ --node-vm-size Standard_D4s_v3 \ --node-count 1 \ + --tier standard \ --max-pods 100 - if [[ $(PROFILE) == *ws22 ]]; then - # don't schedule anything on the linux system pool - echo "Updating $CLUSTER_NAME to not schedule anything on linux pool..." - az aks nodepool update \ - --cluster-name $CLUSTER_NAME \ - -g $(RESOURCE_GROUP) \ - -n nodepool1 \ - --node-taints CriticalAddonsOnly=true:NoSchedule + echo "Getting credentials to $CLUSTER_NAME" + az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig + mkdir -p ~/.kube/ + cp ./kubeconfig ~/.kube/config + + # install kwok on linux node + cd $(Pipeline.Workspace)/s/test/scale/ + chmod u+x run-kwok-as-pod.sh test-scale.sh connectivity/test-connectivity.sh + ./run-kwok-as-pod.sh + # need reliability in case multiple controllers enter CrashLoopBackOff from "context cancelled" + kubectl scale deployment -n kube-system -l app=kwok-controller --replicas=5 + if [[ $(PROFILE) == *ws22 ]]; then echo "Adding Windows nodepool to $CLUSTER_NAME" az aks nodepool add \ --resource-group $(RESOURCE_GROUP) \ @@ -163,11 +163,6 @@ jobs: --max-pods 100 fi - echo "Getting credentials to $CLUSTER_NAME" - az aks get-credentials -g $(RESOURCE_GROUP) -n $CLUSTER_NAME --overwrite-existing --file ./kubeconfig - mkdir -p ~/.kube/ - cp ./kubeconfig ~/.kube/config - - task: AzureCLI@2 displayName: "Deploy NPM to Test Cluster" inputs: @@ -230,15 +225,6 @@ jobs: set -e export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE) mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME - ./kwok --kubeconfig ~/.kube/config \ - --cidr=155.0.0.0/16 \ - --node-ip=155.0.0.1 \ - --manage-all-nodes=false \ - --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ - --manage-nodes-with-label-selector= \ - --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ - --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-scale-up.log & - kwok_pid=$! # 20 kwok nodes # 1000 kwok Pods @@ -262,7 +248,6 @@ jobs: --num-unique-labels-per-deployment=2 \ --num-shared-labels-per-pod=10 rc=$? - kill $kwok_pid exit $rc - task: AzureCLI@2 @@ -277,20 +262,17 @@ jobs: set -e export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE) mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME - ./kwok --kubeconfig ~/.kube/config \ - --cidr=155.0.0.0/16 \ - --node-ip=155.0.0.1 \ - --manage-all-nodes=false \ - --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ - --manage-nodes-with-label-selector= \ - --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ - --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-bootup-latency.log & - kwok_pid=$! kubectl rollout restart -n kube-system ds azure-npm-win echo "sleeping 3 minutes to allow NPM pods to restart after scale-up..." sleep 3m + kubectl get pod -n kube-system -l app=kwok-controller -owide + kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || { + echo "##vso[task.logissue type=error]need at least one kwok pod running" + exit 1 + } + cd $(Pipeline.Workspace)/s/test/scale/connectivity/ # notes for Windows: # initial connectivity should be established within 15 minutes of NPM restart (12 minute timeout since we already waited 3 minutes above) @@ -302,11 +284,29 @@ jobs: --max-wait-after-adding-netpol=30 rc=$? if [[ $rc != 0 ]]; then + echo "capturing cluster state due to failure" + if [[ $(PROFILE) == *ws22 ]]; then + cd $(Pipeline.Workspace)/s/debug/windows/npm/ + chmod u+x win-debug.sh + ./win-debug.sh + mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ + else + set -x + npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'` + kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out + kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out + fi + kubectl get pod -n scale-test kubectl get pod -n connectivity-test + exit $rc fi - kill $kwok_pid - exit $rc + + kubectl get pod -n kube-system -l app=kwok-controller -owide + kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || { + echo "##vso[task.logissue type=error]need at least one kwok pod running" + exit 1 + } - task: AzureCLI@2 displayName: "CRUD at Medium Scale" @@ -320,15 +320,6 @@ jobs: set -e export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE) mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME - ./kwok --kubeconfig ~/.kube/config \ - --cidr=155.0.0.0/16 \ - --node-ip=155.0.0.1 \ - --manage-all-nodes=false \ - --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ - --manage-nodes-with-label-selector= \ - --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ - --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud.log & - kwok_pid=$! # will delete scale-test and connectivity-test namespaces from previous run # 10 kwok Pods @@ -359,7 +350,6 @@ jobs: --delete-pods-interval=120 \ --delete-pods-times=2 rc=$? - kill $kwok_pid exit $rc - task: AzureCLI@2 @@ -374,15 +364,13 @@ jobs: set -e export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE) mkdir -p $(System.DefaultWorkingDirectory)/$CLUSTER_NAME - ./kwok --kubeconfig ~/.kube/config \ - --cidr=155.0.0.0/16 \ - --node-ip=155.0.0.1 \ - --manage-all-nodes=false \ - --manage-nodes-with-annotation-selector=kwok.x-k8s.io/node=fake \ - --manage-nodes-with-label-selector= \ - --disregard-status-with-annotation-selector=kwok.x-k8s.io/status=custom \ - --disregard-status-with-label-selector= > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/kwok-crud-connectivity.log & - kwok_pid=$! + + kubectl get pod -n kube-system -l app=kwok-controller -owide + kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || { + echo "##vso[task.logissue type=error]need at least one kwok pod running" + exit 1 + } + cd $(Pipeline.Workspace)/s/test/scale/connectivity/ # initial connectivity should be established within 10 minutes @@ -394,11 +382,29 @@ jobs: --max-wait-after-adding-netpol=20 rc=$? if [[ $rc != 0 ]]; then + echo "capturing cluster state due to failure" + if [[ $(PROFILE) == *ws22 ]]; then + cd $(Pipeline.Workspace)/s/debug/windows/npm/ + chmod u+x win-debug.sh + ./win-debug.sh + mv logs_* $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ + else + set -x + npmPod=`kubectl get pod -n kube-system | grep npm | grep -v npm-win | awk '{print $1}' | head -n 1 | tr -d '\n'` + kubectl exec -n kube-system $npmPod -- iptables-nft -vnL > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/iptables.out + kubectl exec -n kube-system $npmPod -- ipset -L > $(System.DefaultWorkingDirectory)/$CLUSTER_NAME/ipset.out + fi + kubectl get pod -n scale-test kubectl get pod -n connectivity-test + exit $rc fi - kill $kwok_pid - exit $rc + + kubectl get pod -n kube-system -l app=kwok-controller -owide + kubectl get pod -n kube-system -l app=kwok-controller -owide | grep -q Running || { + echo "##vso[task.logissue type=error]need at least one kwok pod running" + exit 1 + } - bash: | export CLUSTER_NAME=$(RESOURCE_GROUP)-$(PROFILE) diff --git a/debug/windows/npm/win-debug.sh b/debug/windows/npm/win-debug.sh index 28f13ca20f..11e4a066c0 100755 --- a/debug/windows/npm/win-debug.sh +++ b/debug/windows/npm/win-debug.sh @@ -1,20 +1,23 @@ +kubeconfig=$1 +if [[ -z $1 ]]; then + echo "kubeconfig not provided. using default kubeconfig" +else + echo "using kubeconfig: $kubeconfig" + kubeconfigArg="--kubeconfig $kubeconfig" +fi + # NOTE: you may not be able to unzip logs.zip in Linux since it was compressed in Windows -set -e +set -x dateString=`date -I` # like 2022-09-24 filepath=logs_$dateString mkdir $filepath echo "gathering logs and writing to $filepath/" -kubectl get pod -A -o wide --show-labels > $filepath/allpods.out -kubectl get netpol -A -o yaml > $filepath/all-netpol-yamls.out -kubectl describe netpol -A > $filepath/all-netpol-descriptions.out - npmPods=() nodes=() -for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do +for npmPodOrNode in `kubectl $kubeconfigArg get pod -n kube-system -owide --output=custom-columns='Name:.metadata.name,Node:spec.nodeName' | grep "npm-win"`; do # for loop will go over each item (npm pod, then its node, then the next npm pod, then its node, ...) - set +e echo $npmPodOrNode | grep -q azure-npm-win- if [ $? -eq 0 ]; then npmPods+=($npmPodOrNode) @@ -22,7 +25,6 @@ for npmPodOrNode in `kubectl get pod -n kube-system -owide --output=custom-colum nodes+=($npmPodOrNode) fi done -set -e echo "npm pods: ${npmPods[@]}" echo "nodes of npm pods: ${nodes[@]}" @@ -33,22 +35,48 @@ for i in $(seq 1 ${#npmPods[*]}); do node=${nodes[$j]} echo "gathering logs. npm pod: $npmPod. node: $node" - kubectl logs -n kube-system $npmPod > $filepath/logs_$npmPod.out + kubectl $kubeconfigArg logs -n kube-system $npmPod > $filepath/logs_$npmPod.out ips=() - for ip in `kubectl get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do + for ip in `kubectl $kubeconfigArg get pod -A -owide --output=custom-columns='IP:.status.podIP,Node:spec.nodeName' | grep $node | grep -oP "\d+\.\d+\.\d+\.\d+"`; do ips+=($ip) done echo "node $node has IPs: ${ips[@]}" echo "copying ps1 file into $npmPod" - kubectl cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1 + kubectl $kubeconfigArg cp ./pod_exec.ps1 kube-system/"$npmPod":execw.ps1 echo "executing ps1 file on $npmPod" - kubectl exec -it -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'" + kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command .\\execw.ps1 "'${ips[@]}'" echo "copying logs.zip from $npmPod. NOTE: this will be a windows-based compressed archive (probably need windows to expand it)" - kubectl cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip + kubectl $kubeconfigArg cp kube-system/"$npmPod":npm-exec-logs.zip $filepath/npm-exec-logs_$node.zip +done + +echo "finished getting HNS info. getting prometheus metrics" + +mkdir -p $filepath/prometheus/node-metrics +for i in $(seq 1 ${#npmPods[*]}); do + j=$((i-1)) + npmPod=${npmPods[$j]} + kubectl $kubeconfigArg exec -n kube-system $npmPod -- powershell.exe -Command "(Invoke-WebRequest -UseBasicParsing http://localhost:10091/node-metrics).Content" > $filepath/prometheus/node-metrics/$npmPod.out +done + +echo "finished getting prometheus metrics. getting cluster state" + +kubectl $kubeconfigArg get pod -A -o wide --show-labels > $filepath/allpods.out +kubectl $kubeconfigArg get netpol -A -o yaml > $filepath/all-netpol-yamls.out +kubectl $kubeconfigArg describe netpol -A > $filepath/all-netpol-descriptions.out + +for ns in `kubectl $kubeconfigArg get pod -A | grep -v Running | grep -v STATUS | awk '{print $1}' | sort | uniq`; do + echo "describing failed pods in namespace $ns..." + failingPods=`kubectl $kubeconfigArg get pod -n $ns | grep -v Running | grep -v STATUS | awk '{print $1}' | xargs echo` + if [[ -z $failingPods ]]; then + continue + fi + echo "failing Pods: $failingPods" + kubectl $kubeconfigArg describe pod -n $ns $failingPods > $filepath/describepod_$ns.out + break done echo "finished gathering all logs. written to $filepath/" diff --git a/test/scale/connectivity/test-connectivity.sh b/test/scale/connectivity/test-connectivity.sh index 8feef98f6a..0f8ae9c89b 100755 --- a/test/scale/connectivity/test-connectivity.sh +++ b/test/scale/connectivity/test-connectivity.sh @@ -29,7 +29,7 @@ REQUIRED PARAMETERS: OPTIONAL PARAMETERS: --kubeconfig= path to kubeconfig file - --kubectl-binary= path to kubectl binary. Default is kubectl + --kubectl-binary= path to kubectl binary. Default is kubectl EXIT CODES: 0 - success diff --git a/test/scale/run-kwok-as-pod.sh b/test/scale/run-kwok-as-pod.sh new file mode 100644 index 0000000000..600bbd0a5b --- /dev/null +++ b/test/scale/run-kwok-as-pod.sh @@ -0,0 +1,5 @@ +# source: https://kwok.sigs.k8s.io/docs/user/kwok-in-cluster/ +KWOK_REPO=kubernetes-sigs/kwok +KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name') +kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml" +kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml" diff --git a/test/scale/test-scale.sh b/test/scale/test-scale.sh index 1a24f7fe7c..167959295b 100755 --- a/test/scale/test-scale.sh +++ b/test/scale/test-scale.sh @@ -276,21 +276,49 @@ fi ## HELPER FUNCTIONS wait_for_pods() { - # wait for all pods to run - minutesToWaitForRealPods=$(( 10 + $numRealPods / 250 )) - set -x - if [[ $numRealPods -gt 0 ]]; then - $KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout="${minutesToWaitForRealPods}m" + if [[ $numKwokPods -gt 0 ]]; then + # wait up to 20 minutes + startDate=`date +%s` + count=0 + while : ; do + echo "waiting for fake pods to run (try $count)" + count=$((count+1)) + # just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready) + set +e -x + $KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout=0 && set -e +x && break + endDate=`date +%s` + set -e +x + if [[ $endDate -gt $(( startDate + (20*60) )) ]]; then + echo "timed out waiting for all kwok pods to run" + k get pod -n scale-test -owide + k get node + k get pod -n kube-system -l app=kwok-controller -owide + k top pod -n kube-system -l app=kwok-controller + k top node + exit 1 + fi + # try recreating nodes if KWOK controller failed + $KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/ + done fi - set +x - # just make sure kwok pods are Running, not necessarily Ready (sometimes kwok pods have NodeNotReady even though the node is ready) - minutesToWaitForKwokPods=$(( 1 + $numKwokPods / 500 )) - set -x - if [[ $numKwokPods -gt 0 ]]; then - $KUBECTL $KUBECONFIG_ARG wait --for=condition=Initialized pods -n scale-test -l is-kwok=true --all --timeout="${minutesToWaitForKwokPods}m" + if [[ $numRealPods -gt 0 ]]; then + # wait up to 10 minutes + startDate=`date +%s` + count=0 + while : ; do + echo "waiting for real pods to run (try $count)" + count=$((count+1)) + set +e -x + $KUBECTL $KUBECONFIG_ARG wait --for=condition=Ready pods -n scale-test -l is-real=true --all --timeout=0 && set -e +x && break + set -e +x + endDate=`date +%s` + if [[ $endDate -gt $(( startDate + (10*60) )) ]]; then + echo "timed out waiting for all real pods to run" + exit 1 + fi + done fi - set +x } ## FILE SETUP @@ -444,19 +472,29 @@ echo set -x $KUBECTL $KUBECONFIG_ARG create ns scale-test +set +x + if [[ $numKwokNodes -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/kwok-nodes/ + set +x fi if [[ $numRealPods -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/real-$realPodType/ + set +x fi if [[ $numKwokPods -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/deployments/kwok/ + set +x fi if [[ $numRealServices -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/services/real/ + set +x fi -set +x + add_shared_labels() { if [[ $numSharedLabelsPerPod -gt 0 ]]; then @@ -489,16 +527,19 @@ if [[ $numUniqueLabelsPerPod -gt 0 ]]; then done fi -set -x +# to better evaluate time to apply ACLs, wait for pods to come up first (takes a variable amount of time) before applying the NetPols +wait_for_pods + if [[ $numUnappliedNetworkPolicies -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/unapplied + set +x fi if [[ $numNetworkPolicies -gt 0 ]]; then + set -x $KUBECTL $KUBECONFIG_ARG apply -f generated/networkpolicies/applied + set +x fi -set +x - -wait_for_pods echo echo "done scaling at $(date -u). Had started at $startDate."