
Commit

Merge pull request #2772 from Josh-Tigera/josh.flake-fixes
- Add localized caching of OSS Calico images to speed up tests and improve reliability by removing the per-test variability of image pull timing. This also has the added benefit of not burning through Docker Hub pull rate limits.
- Fix a race condition in the CRD management tests.
- Fix a race condition in the mainline tests.
- Use a non-caching Kubernetes client and remove the use of unstructured types (see the sketch below).
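On the last point: a controller-runtime manager's default client serves reads from a shared informer cache, which can lag the API server and is one source of the flakes this PR targets. Below is a minimal sketch of constructing a direct, non-caching client; it uses only the standard sigs.k8s.io/controller-runtime packages these tests already import, and the package and function names are illustrative, not from this diff.

```go
// Sketch: a non-caching controller-runtime client. Reads issued through it
// (Get/List) hit the API server directly, so tests observe the current
// server-side state rather than a possibly stale informer cache.
package test // package name is illustrative

import (
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/config"
)

func newDirectClient() (client.Client, error) {
	cfg, err := config.GetConfig() // kubeconfig or in-cluster config
	if err != nil {
		return nil, err
	}
	// With no Scheme set, client.New falls back to client-go's default
	// scheme; tests that touch CRDs or operator APIs would register those
	// types on a custom Scheme first.
	return client.New(cfg, client.Options{})
}
```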
Josh-Tigera authored Jul 28, 2023
2 parents 18d3f88 + c721b1d commit 6d0bf25
Showing 7 changed files with 186 additions and 148 deletions.
6 changes: 5 additions & 1 deletion .semaphore/semaphore.yml
@@ -128,6 +128,8 @@ blocks:
commands:
- make ut
- make dirty-check
epilogue:
commands:
- '[[ -d ./report/ut ]] && test-results publish ./report/ut'

- name: 'FV'
@@ -144,7 +146,9 @@ blocks:
commands:
- make fv
- make dirty-check
- '[[ -f ./report/fv/fv_suite.xml ]] && test-results publish ./report/fv/fv_suite.xml'
epilogue:
commands:
- '[[ -d ./report/fv ]] && test-results publish ./report/fv'

after_pipeline:
task:
70 changes: 69 additions & 1 deletion Makefile
@@ -111,6 +111,7 @@ CALICO_BUILD?=calico/go-build:$(GO_BUILD_VER)-$(ARCH)
SRC_FILES=$(shell find ./pkg -name '*.go')
SRC_FILES+=$(shell find ./api -name '*.go')
SRC_FILES+=$(shell find ./controllers -name '*.go')
SRC_FILES+=$(shell find ./test -name '*.go')
SRC_FILES+=main.go

EXTRA_DOCKER_ARGS += -e GO111MODULE=on -e GOPRIVATE=github.com/tigera/*
@@ -299,7 +300,7 @@ ut:
ginkgo -trace -r -focus="$(GINKGO_FOCUS)" $(GINKGO_ARGS) "$(UT_DIR)"'

## Run the functional tests
fv: cluster-create run-fvs cluster-destroy
fv: cluster-create load-container-images run-fvs cluster-destroy
run-fvs:
-mkdir -p .go-pkg-cache report
$(CONTAINERIZED) $(CALICO_BUILD) sh -c '$(GIT_CONFIG_SSH) \
@@ -325,6 +326,73 @@ cluster-create: $(BINDIR)/kubectl $(BINDIR)/kind
# Wait for controller manager to be running and healthy.
while ! KUBECONFIG=$(KUBECONFIG) $(BINDIR)/kubectl get serviceaccount default; do echo "Waiting for default serviceaccount to be created..."; sleep 2; done

IMAGE_REGISTRY := docker.io
VERSION_TAG := master
NODE_IMAGE := calico/node
APISERVER_IMAGE := calico/apiserver
CNI_IMAGE := calico/cni
FLEXVOL_IMAGE := calico/pod2daemon-flexvol
KUBECONTROLLERS_IMAGE := calico/kube-controllers
TYPHA_IMAGE := calico/typha
CSI_IMAGE := calico/csi
NODE_DRIVER_REGISTRAR_IMAGE := calico/node-driver-registrar

.PHONY: calico-node.tar
calico-node.tar:
docker pull $(IMAGE_REGISTRY)/$(NODE_IMAGE):$(VERSION_TAG)
docker save --output $@ $(NODE_IMAGE):$(VERSION_TAG)

.PHONY: calico-apiserver.tar
calico-apiserver.tar:
docker pull $(IMAGE_REGISTRY)/$(APISERVER_IMAGE):$(VERSION_TAG)
docker save --output $@ $(APISERVER_IMAGE):$(VERSION_TAG)

.PHONY: calico-cni.tar
calico-cni.tar:
docker pull $(IMAGE_REGISTRY)/$(CNI_IMAGE):$(VERSION_TAG)
docker save --output $@ $(CNI_IMAGE):$(VERSION_TAG)

.PHONY: calico-pod2daemon-flexvol.tar
calico-pod2daemon-flexvol.tar:
docker pull $(IMAGE_REGISTRY)/$(FLEXVOL_IMAGE):$(VERSION_TAG)
docker save --output $@ $(FLEXVOL_IMAGE):$(VERSION_TAG)

.PHONY: calico-kube-controllers.tar
calico-kube-controllers.tar:
docker pull $(IMAGE_REGISTRY)/$(KUBECONTROLLERS_IMAGE):$(VERSION_TAG)
docker save --output $@ $(KUBECONTROLLERS_IMAGE):$(VERSION_TAG)

.PHONY: calico-typha.tar
calico-typha.tar:
docker pull $(IMAGE_REGISTRY)/$(TYPHA_IMAGE):$(VERSION_TAG)
docker save --output $@ $(TYPHA_IMAGE):$(VERSION_TAG)

.PHONY: calico-csi.tar
calico-csi.tar:
docker pull $(IMAGE_REGISTRY)/$(CSI_IMAGE):$(VERSION_TAG)
docker save --output $@ $(CSI_IMAGE):$(VERSION_TAG)

.PHONY: calico-node-driver-registrar.tar
calico-node-driver-registrar.tar:
docker pull $(IMAGE_REGISTRY)/$(NODE_DRIVER_REGISTRAR_IMAGE):$(VERSION_TAG)
docker save --output $@ $(NODE_DRIVER_REGISTRAR_IMAGE):$(VERSION_TAG)

IMAGE_TARS := calico-node.tar \
calico-apiserver.tar \
calico-cni.tar \
calico-pod2daemon-flexvol.tar \
calico-kube-controllers.tar \
calico-typha.tar \
calico-csi.tar \
calico-node-driver-registrar.tar

load-container-images: ./test/load_images_on_kind_cluster.sh $(IMAGE_TARS)
# Load the latest tar files onto the currently running kind cluster.
KUBECONFIG=$(KUBECONFIG) ./test/load_images_on_kind_cluster.sh $(IMAGE_TARS)
# Restart the Calico containers so they launch with the newly loaded code.
# TODO: We should be able to do this without restarting everything in kube-system.
KUBECONFIG=$(KUBECONFIG) $(BINDIR)/kubectl delete pods -n kube-system --all

## Deploy CRDs needed for UTs. CRDs needed by ECK that we don't use are not deployed.
## kubectl create is used for prometheus as a workaround for https://github.com/prometheus-community/helm-charts/issues/1500
## kubectl create is used for operator CRDs since the Installation API is large enough now that we hit the following error:
20 changes: 2 additions & 18 deletions test/active_test.go
@@ -1,4 +1,4 @@
// Copyright (c) 2021-2022 Tigera, Inc. All rights reserved.
// Copyright (c) 2021-2023 Tigera, Inc. All rights reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -27,8 +27,6 @@ import (
corev1 "k8s.io/api/core/v1"
kerror "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -74,21 +72,7 @@ var _ = Describe("pkg/active with apiserver", func() {
}
err := c.Delete(context.Background(), ns)
Expect(err).NotTo(HaveOccurred())
// Validate the calico-system namespace is deleted using an unstructured type.
// This hits the API server directly instead of using the client cache.
// This should help with flaky tests.
Eventually(func() error {
u := &unstructured.Unstructured{}
u.SetGroupVersionKind(schema.GroupVersionKind{
Group: "",
Version: "v1",
Kind: "Namespace",
})

k := client.ObjectKey{Name: "calico-system"}
err := c.Get(context.Background(), k, u)
return err
}, 240*time.Second).ShouldNot(BeNil())
ExpectResourceDestroyed(c, ns, 10*time.Second)
active.OsExitOverride = os.Exit
active.TickerRateOverride = originalTickRate
})
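The ExpectResourceDestroyed helper used above replaces the inline unstructured polling; its real implementation lives in this repo's test utilities and is not part of this diff. A hypothetical sketch of the shape implied by the call sites:

```go
// Hypothetical sketch of the ExpectResourceDestroyed test helper, inferred
// from its call sites in this PR: poll with the non-caching client until
// the object is confirmed gone, failing the assertion after the timeout.
package test

import (
	"context"
	"fmt"
	"time"

	. "github.com/onsi/gomega"
	kerror "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func ExpectResourceDestroyed(c client.Client, obj client.Object, timeout time.Duration) {
	Eventually(func() error {
		err := c.Get(context.Background(), client.ObjectKeyFromObject(obj), obj)
		if kerror.IsNotFound(err) {
			return nil // confirmed deleted
		} else if err != nil {
			return err // transient API error; keep polling
		}
		return fmt.Errorf("%s still exists", obj.GetName())
	}, timeout).ShouldNot(HaveOccurred())
}
```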
98 changes: 31 additions & 67 deletions test/crd_management_test.go
@@ -1,4 +1,4 @@
// Copyright (c) 2021-2022 Tigera, Inc. All rights reserved.
// Copyright (c) 2021-2023 Tigera, Inc. All rights reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -28,9 +28,7 @@ import (
apiextenv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
kerror "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/kubernetes"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
@@ -79,18 +77,7 @@ var _ = Describe("CRD management tests", func() {
})

AfterEach(func() {
defer func() {
cancel()
Eventually(func() error {
select {
case <-operatorDone:
return nil
default:
return fmt.Errorf("operator did not shutdown")
}
}, 60*time.Second).Should(BeNil())
}()
waitForProductTeardown(c)
cleanupResources(c)

// Clean up Calico data that might be left behind.
Eventually(func() error {
@@ -118,63 +105,45 @@ var _ = Describe("CRD management tests", func() {

mgr = nil

_ = c.Delete(context.Background(), npCRD.DeepCopy())
// Need to make sure the operator is shut down prior to deleting and recreating the networkpolicies CRD otherwise
// the controller initialized within the tests can recreate the CRD between the deletion confirmation and recreation
// attempt, creating a timing window where failures can occur.
cancel()
Eventually(func() error {
u := &unstructured.Unstructured{}
u.SetGroupVersionKind(schema.GroupVersionKind{
Group: "apiextensions.k8s.io",
Version: "v1",
Kind: "CustomResourceDefinition",
})

k := client.ObjectKey{Name: npCRD.Name}
err := c.Get(context.Background(), k, u)
return err
}, 20*time.Second).ShouldNot(BeNil())
npCRD.SetResourceVersion("")
err := c.Create(context.Background(), npCRD)
select {
case <-operatorDone:
return nil
default:
return fmt.Errorf("operator did not shutdown")
}
}, 60*time.Second).Should(BeNil())

err := c.Delete(context.Background(), npCRD)
Expect(err).NotTo(HaveOccurred())
// The "MustPassRepeatedly" here is important because there is some jitter that isn't fully understood that can
// cause flakes without it
Eventually(func() error {
u := &unstructured.Unstructured{}
u.SetGroupVersionKind(schema.GroupVersionKind{
Group: "apiextensions.k8s.io",
Version: "v1",
Kind: "CustomResourceDefinition",
})

k := client.ObjectKey{Name: npCRD.Name}
err := c.Get(context.Background(), k, u)
return err
}, 20*time.Second).Should(BeNil())
err := GetResource(c, npCRD)
if kerror.IsNotFound(err) || kerror.IsGone(err) {
return nil
} else if err != nil {
return err
} else {
return fmt.Errorf("NetworkPolicy CRD still exists")
}
}, 20*time.Second).MustPassRepeatedly(50).ShouldNot(HaveOccurred())
npCRD.SetResourceVersion("")
err = c.Create(context.Background(), npCRD)
Expect(err).NotTo(HaveOccurred())
ExpectResourceCreated(c, npCRD)
})

Describe("Installing CRD", func() {
BeforeEach(func() {
// Delete the networkpolicies so we can tell that it gets created.
err := c.Delete(context.Background(), npCRD)
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
u := &unstructured.Unstructured{}
u.SetGroupVersionKind(schema.GroupVersionKind{
Group: "apiextensions.k8s.io",
Version: "v1",
Kind: "CustomResourceDefinition",
})

k := client.ObjectKey{Name: "networkpolicies.crd.projectcalico.org"}
err := c.Get(context.Background(), k, u)
if err == nil {
return fmt.Errorf("networkpolicies CRD still exists")
}
if kerror.IsNotFound(err) {
return nil
}
return err
}, 10*time.Second, 1*time.Second).Should(BeNil())
})
AfterEach(func() {
// Delete any CR that might have been created by the test.
removeInstallResourceCR(c, "default", context.Background())
ExpectResourceDestroyed(c, npCRD, 10*time.Second)
})

It("Should create CRD if it doesn't exist", func() {
@@ -210,11 +179,6 @@ var _ = Describe("CRD management tests", func() {
return nil
}, 60*time.Second, 1*time.Second).Should(BeNil())
})
AfterEach(func() {
// Delete any CR that might have been created by the test.
removeInstallResourceCR(c, "default", context.Background())
})

It("Should add tier to networkpolicy CRD", func() {
c, shutdownContext, cancel, mgr = setupManager(ManageCRDsEnable)
operatorDone = installResourceCRD(c, mgr, shutdownContext, &operator.InstallationSpec{Variant: operator.TigeraSecureEnterprise})
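One detail in the AfterEach above worth calling out: Gomega's MustPassRepeatedly(50) makes the Eventually pass only after the polled check succeeds 50 consecutive times, so a CRD that momentarily reports NotFound and is then recreated by a not-yet-stopped controller resets the streak instead of ending the wait early. A usage sketch, reusing names from the hunk above (GetResource is this repo's test helper):

```go
// Sketch of MustPassRepeatedly semantics: the polled function must succeed
// 50 times in a row within the 20s window; any single failure resets the
// consecutive-pass counter, catching resources that flicker back into
// existence after deletion.
Eventually(func() error {
	err := GetResource(c, npCRD) // test helper shown in the diff above
	if kerror.IsNotFound(err) || kerror.IsGone(err) {
		return nil // still gone: one more consecutive pass
	} else if err != nil {
		return err
	}
	return fmt.Errorf("NetworkPolicy CRD still exists")
}, 20*time.Second).MustPassRepeatedly(50).ShouldNot(HaveOccurred())
```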
18 changes: 18 additions & 0 deletions test/load_images_on_kind_cluster.sh
@@ -0,0 +1,18 @@
#!/bin/bash

function load_image() {
local node=${1}
for IMAGETAR in ${@:2}
do
docker cp ./${IMAGETAR} ${node}:/${IMAGETAR}
docker exec -t ${node} ctr -n=k8s.io images import /${IMAGETAR}
docker exec -t ${node} rm /${IMAGETAR}
done
}

KIND_NODES="kind-control-plane kind-worker kind-worker2 kind-worker3"

for NODE in ${KIND_NODES}
do
load_image ${NODE} $@
done

