From f6fd9cdbb3ed2eacef1fe5e71da79aa673aa52b9 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Wed, 21 Aug 2024 23:02:30 -0700 Subject: [PATCH 1/3] feat: --network-dataplane flag (#318) * feat: --network-dataplane flag * commit from codespace --- pkg/operator/options/options.go | 11 ++++++----- pkg/operator/options/options_validation.go | 7 +++++++ pkg/operator/options/suite_test.go | 12 +++++++++++- pkg/providers/launchtemplate/launchtemplate.go | 17 ++++++++++------- pkg/test/options.go | 2 ++ 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/pkg/operator/options/options.go b/pkg/operator/options/options.go index 5ede66943..d3a3acac8 100644 --- a/pkg/operator/options/options.go +++ b/pkg/operator/options/options.go @@ -62,14 +62,14 @@ type Options struct { ClusterEndpoint string // => APIServerName in bootstrap, except needs to be w/o https/port VMMemoryOverheadPercent float64 ClusterID string - KubeletClientTLSBootstrapToken string // => TLSBootstrapToken in bootstrap (may need to be per node/nodepool) - SSHPublicKey string // ssh.publicKeys.keyData => VM SSH public key // TODO: move to v1alpha2.AKSNodeClass? - NetworkPlugin string // => NetworkPlugin in bootstrap - NetworkPolicy string // => NetworkPolicy in bootstrap + KubeletClientTLSBootstrapToken string // => TLSBootstrapToken in bootstrap (may need to be per node/nodepool) + SSHPublicKey string // ssh.publicKeys.keyData => VM SSH public key // TODO: move to v1alpha2.AKSNodeClass? + NetworkPlugin string // => NetworkPlugin in bootstrap + NetworkPolicy string // => NetworkPolicy in bootstrap + NetworkDataplane string NodeIdentities []string // => Applied onto each VM SubnetID string // => VnetSubnetID to use (for nodes in Azure CNI Overlay and Azure CNI + pod subnet; for for nodes and pods in Azure CNI), unless overridden via AKSNodeClass - setFlags map[string]bool } @@ -81,6 +81,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) { fs.StringVar(&o.SSHPublicKey, "ssh-public-key", env.WithDefaultString("SSH_PUBLIC_KEY", ""), "[REQUIRED] VM SSH public key.") fs.StringVar(&o.NetworkPlugin, "network-plugin", env.WithDefaultString("NETWORK_PLUGIN", "azure"), "The network plugin used by the cluster.") fs.StringVar(&o.NetworkPolicy, "network-policy", env.WithDefaultString("NETWORK_POLICY", ""), "The network policy used by the cluster.") + fs.StringVar(&o.NetworkDataplane, "network-dataplane", env.WithDefaultString("NETWORK_DATAPLANE", "cilium"), "The network dataplane used by the cluster.") fs.StringVar(&o.SubnetID, "vnet-subnet-id", env.WithDefaultString("VNET_SUBNET_ID", ""), "The default subnet ID to use for new nodes. This must be a valid ARM resource ID for subnet that does not overlap with the service CIDR or the pod CIDR") fs.Var(newNodeIdentitiesValue(env.WithDefaultString("NODE_IDENTITIES", ""), &o.NodeIdentities), "node-identities", "User assigned identities for nodes.") } diff --git a/pkg/operator/options/options_validation.go b/pkg/operator/options/options_validation.go index 9ed1bb21c..853c580ae 100644 --- a/pkg/operator/options/options_validation.go +++ b/pkg/operator/options/options_validation.go @@ -31,6 +31,7 @@ func (o Options) Validate() error { o.validateRequiredFields(), o.validateEndpoint(), o.validateVMMemoryOverheadPercent(), + o.validateNetworkDataplane(), o.validateVnetSubnetID(), validate.Struct(o), ) @@ -44,6 +45,12 @@ func (o Options) validateVnetSubnetID() error { return nil } +func (o Options) validateNetworkDataplane() error { + if o.NetworkDataplane != "azure" && o.NetworkDataplane != "cilium" { + return fmt.Errorf("network dataplane %s is not a valid network dataplane, valid dataplanes are ('azure', 'cilium')", o.NetworkDataplane) + } + return nil +} func (o Options) validateEndpoint() error { if o.ClusterEndpoint == "" { return nil diff --git a/pkg/operator/options/suite_test.go b/pkg/operator/options/suite_test.go index 064a59033..eadd978ca 100644 --- a/pkg/operator/options/suite_test.go +++ b/pkg/operator/options/suite_test.go @@ -113,8 +113,18 @@ var _ = Describe("Options", func() { })) }) }) - Context("Validation", func() { + It("should fail validation when networkDataplane is not invalid", func() { + err := opts.Parse( + fs, + "--cluster-endpoint", "https://karpenter-000000000000.hcp.westus2.staging.azmk8s.io", + "--kubelet-bootstrap-token", "flag-bootstrap-token", + "--ssh-public-key", "flag-ssh-public-key", + "--network-dataplane", "ciluum", + ) + Expect(err).To(MatchError(ContainSubstring("network dataplane ciluum is not a valid network dataplane, valid dataplanes are ('azure', 'cilium')"))) + }) + It("should fail validation when clusterName not included", func() { err := opts.Parse( fs, diff --git a/pkg/providers/launchtemplate/launchtemplate.go b/pkg/providers/launchtemplate/launchtemplate.go index 3c5e5ac95..22c52e30f 100644 --- a/pkg/providers/launchtemplate/launchtemplate.go +++ b/pkg/providers/launchtemplate/launchtemplate.go @@ -121,13 +121,16 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp labels = lo.Assign(labels, vnetLabels) // TODO: Make conditional on epbf dataplane - // This label is required for the cilium agent daemonset because - // we select the nodes for the daemonset based on this label - // - key: kubernetes.azure.com/ebpf-dataplane - // operator: In - // values: - // - cilium - labels[vnetDataPlaneLabel] = networkDataplaneCilium + if options.FromContext(ctx).NetworkDataplane == networkDataplaneCilium { + // This label is required for the cilium agent daemonset because + // we select the nodes for the daemonset based on this label + // - key: kubernetes.azure.com/ebpf-dataplane + // operator: In + // values: + // - cilium + + labels[vnetDataPlaneLabel] = networkDataplaneCilium + } return ¶meters.StaticParameters{ ClusterName: options.FromContext(ctx).ClusterName, diff --git a/pkg/test/options.go b/pkg/test/options.go index de910df89..5b9c1b83c 100644 --- a/pkg/test/options.go +++ b/pkg/test/options.go @@ -33,6 +33,7 @@ type OptionsFields struct { SSHPublicKey *string NetworkPlugin *string NetworkPolicy *string + NetworkDataplane *string VMMemoryOverheadPercent *float64 NodeIdentities []string SubnetID *string @@ -53,6 +54,7 @@ func Options(overrides ...OptionsFields) *azoptions.Options { SSHPublicKey: lo.FromPtrOr(options.SSHPublicKey, "test-ssh-public-key"), NetworkPlugin: lo.FromPtrOr(options.NetworkPlugin, "azure"), NetworkPolicy: lo.FromPtrOr(options.NetworkPolicy, "cilium"), + NetworkDataplane: lo.FromPtrOr(options.NetworkDataplane, "cilium"), VMMemoryOverheadPercent: lo.FromPtrOr(options.VMMemoryOverheadPercent, 0.075), NodeIdentities: options.NodeIdentities, SubnetID: lo.FromPtrOr(options.SubnetID, "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpentervnet/subnets/karpentersub"), From a43b9940731b3fb51e59de6a5442dca1bb80da57 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Wed, 21 Aug 2024 23:29:36 -0700 Subject: [PATCH 2/3] fix: set `DisableKubeletCloudCredentialProviders=false` for versions without OOT credential provider (1.29) (#456) * fix: DisableKubeletCloudCredentialProviders should default to false for 1.29 * fix: adding 1.31 credential provider into switch * test: testing 1.31 credential provider url * fix: ci --------- Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/imagefamily/bootstrap/aksbootstrap.go | 6 ++++++ pkg/providers/imagefamily/bootstrap/aksbootstrap_test.go | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go index 83014232b..e337e4188 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go @@ -437,6 +437,9 @@ func CredentialProviderURL(kubernetesVersion, arch string) string { credentialProviderVersion = "1.29.2" case 30: credentialProviderVersion = "1.30.0" + + case 31: + credentialProviderVersion = "1.31.0" } return fmt.Sprintf("%s/cloud-provider-azure/v%s/binaries/azure-acr-credential-provider-linux-%s-v%s.tar.gz", globalAKSMirror, credentialProviderVersion, arch, credentialProviderVersion) @@ -497,6 +500,9 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) { kubeletFlagsBase["--image-credential-provider-config"] = "/var/lib/kubelet/credential-provider-config.yaml" kubeletFlagsBase["--image-credential-provider-bin-dir"] = "/var/lib/kubelet/credential-provider" } else { // Versions Less than 1.30 + // we can make this logic smarter later when we have more than one + // for now just adding here. + kubeletFlagsBase["--feature-gates"] = "DisableKubeletCloudCredentialProviders=false" kubeletFlagsBase["--azure-container-registry-config"] = "/etc/kubernetes/azure.json" } // merge and stringify taints diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap_test.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap_test.go index ec0cb423f..0d3c3764b 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap_test.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap_test.go @@ -65,6 +65,11 @@ func TestGetCredentialProviderURL(t *testing.T) { arch string url string }{ + { + version: "1.31.0", + arch: "amd64", + url: fmt.Sprintf("%s/cloud-provider-azure/v1.31.0/binaries/azure-acr-credential-provider-linux-amd64-v1.31.0.tar.gz", globalAKSMirror), + }, { version: "1.30.2", arch: "amd64", From 1797b302773cd9e2d13f9ec671beada796a22ea3 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:17:24 -0700 Subject: [PATCH 3/3] test(e2e): validate that pulling from an acr registry attached to aks via --attach-acr with karpenter nodes works (#457) * test(e2e): validate that pulling from an acr registry attached to aks via --attach-acr with karpenter nodes works * fix: ci * test: ci * fix: crd validation breaks on local so accidentally committed the change with it disabled * fix: passing in azure acr name from env rather than using makefile default * fix: ci again? * fix: nit comments addressed * test: only provisioning one pod --- .github/actions/e2e/create-acr/action.yaml | 4 ++ .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 2 +- Makefile | 2 +- Makefile-az.mk | 9 +++ test/suites/acr/suite_test.go | 80 ++++++++++++++++++++++ 6 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 test/suites/acr/suite_test.go diff --git a/.github/actions/e2e/create-acr/action.yaml b/.github/actions/e2e/create-acr/action.yaml index a485e80ff..aadfed21c 100644 --- a/.github/actions/e2e/create-acr/action.yaml +++ b/.github/actions/e2e/create-acr/action.yaml @@ -40,3 +40,7 @@ runs: - name: create ACR shell: bash run: AZURE_RESOURCE_GROUP=${{ inputs.resource_group }} AZURE_ACR_NAME=${{ inputs.acr_name }} AZURE_LOCATION=${{ inputs.location }} make az-mkacr + - name: import needed images + shell: bash + run: | + AZURE_ACR_NAME=${{ inputs.acr_name }} make az-acrimport diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index 6dde9d0de..43ce1a351 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -43,7 +43,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Nonbehavioral, Utilization, GPU, Drift, Integration, NodeClaim, Chaos] + suite: [Nonbehavioral, Utilization, GPU, Drift, Integration, NodeClaim, Chaos, ACR] permissions: contents: read id-token: write diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 548e54cde..f9a0abecd 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -129,7 +129,7 @@ jobs: if: inputs.suite != 'Nonbehavioral' run: | AZURE_CLUSTER_NAME=${{ env.CLUSTER_NAME }} AZURE_RESOURCE_GROUP=${{ env.RG_NAME }} make az-creds - CLUSTER_NAME=${{ env.CLUSTER_NAME }} TEST_SUITE="${{ inputs.suite }}" GIT_REF="$(git rev-parse HEAD)" make e2etests + CLUSTER_NAME=${{ env.CLUSTER_NAME }} AZURE_ACR_NAME=${{ env.ACR_NAME}} TEST_SUITE="${{ inputs.suite }}" GIT_REF="$(git rev-parse HEAD)" make e2etests - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs if: failure() || cancelled() diff --git a/Makefile b/Makefile index d8204f6c4..3a93ed537 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,7 @@ e2etests: ## Run the e2e suite against your local cluster # -count 1: prevents caching # -timeout: If a test binary runs longer than TEST_TIMEOUT, panic # -v: verbose output - cd test && CLUSTER_NAME=${CLUSTER_NAME} go test \ + cd test && CLUSTER_NAME=${CLUSTER_NAME} AZURE_ACR_NAME=${AZURE_ACR_NAME} go test \ -p 1 \ -count 1 \ -timeout ${TEST_TIMEOUT} \ diff --git a/Makefile-az.mk b/Makefile-az.mk index 5db0a0864..3019d9a9e 100755 --- a/Makefile-az.mk +++ b/Makefile-az.mk @@ -38,6 +38,15 @@ az-mkacr: az-mkrg ## Create test ACR --sku Basic --admin-enabled -o none az acr login --name $(AZURE_ACR_NAME) +az-acrimport: ## Imports an image to an acr registry + az acr import --name $(AZURE_ACR_NAME) --source "mcr.microsoft.com/oss/kubernetes/pause:3.6" --image "pause:3.6" + +az-cleanenv: az-rmnodeclaims-fin ## Deletes a few common karpenter testing resources(pods, nodepools, nodeclaims, aksnodeclasses) + kubectl delete pods -n default --all + kubectl delete nodeclaims --all + kubectl delete nodepools --all + kubectl delete aksnodeclasses --all + az-mkaks: az-mkacr ## Create test AKS cluster (with --vm-set-type AvailabilitySet for compatibility with standalone VMs) az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --attach-acr $(AZURE_ACR_NAME) --location $(AZURE_LOCATION) \ --enable-managed-identity --node-count 3 --generate-ssh-keys --vm-set-type AvailabilitySet -o none diff --git a/test/suites/acr/suite_test.go b/test/suites/acr/suite_test.go new file mode 100644 index 000000000..076b03ca0 --- /dev/null +++ b/test/suites/acr/suite_test.go @@ -0,0 +1,80 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package acr + +import ( + "fmt" + "os" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + + "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/Azure/karpenter-provider-azure/test/pkg/environment/azure" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/labels" + corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1" + "sigs.k8s.io/karpenter/pkg/test" +) + +var env *azure.Environment +var nodeClass *v1alpha2.AKSNodeClass +var nodePool *corev1beta1.NodePool +var pauseImage string + +func TestAcr(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = azure.NewEnvironment(t) + acrName := os.Getenv("AZURE_ACR_NAME") + Expect(acrName).NotTo(BeEmpty(), "AZURE_ACR_NAME must be set for the acr test suite") + pauseImage = fmt.Sprintf("%s.azurecr.io/pause:3.6", acrName) + }) + RunSpecs(t, "Acr") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = env.DefaultAKSNodeClass() + nodePool = env.DefaultNodePool(nodeClass) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Acr", func() { + Describe("Image Pull", func() { + It("should allow karpenter user pool nodes to pull images from the clusters attached acr", func() { + deployment := test.Deployment(test.DeploymentOptions{ + Replicas: 1, + PodOptions: test.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1.1"), + }, + }, + Image: pauseImage, + }, + }) + + env.ExpectCreated(nodePool, nodeClass, deployment) + env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*15, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas)) + }) + }) +})