From 19c80d471b63100c2e8d26ef0913d18a4cf471a7 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Wed, 4 Sep 2024 16:02:05 +0200 Subject: [PATCH] Add new ClusterCache implementation --- api/v1beta1/v1beta2_condition_consts.go | 6 + bootstrap/kubeadm/controllers/alias.go | 6 +- .../controllers/kubeadmconfig_controller.go | 18 +- .../kubeadmconfig_controller_test.go | 24 +- bootstrap/kubeadm/main.go | 98 ++- controllers/alias.go | 46 +- controllers/clustercache/cluster_accessor.go | 497 +++++++++++ .../clustercache/cluster_accessor_client.go | 329 ++++++++ .../cluster_accessor_client_test.go | 146 ++++ .../clustercache/cluster_accessor_test.go | 382 +++++++++ controllers/clustercache/cluster_cache.go | 667 +++++++++++++++ .../clustercache/cluster_cache_fake.go | 39 + .../clustercache/cluster_cache_test.go | 781 ++++++++++++++++++ controllers/clustercache/doc.go | 57 ++ controllers/clustercache/index.go | 30 + controllers/clustercache/suite_test.go | 38 + controlplane/kubeadm/controllers/alias.go | 6 +- controlplane/kubeadm/internal/cluster.go | 10 +- controlplane/kubeadm/internal/cluster_test.go | 49 +- .../internal/controllers/controller.go | 24 +- .../internal/controllers/controller_test.go | 2 +- controlplane/kubeadm/main.go | 98 ++- exp/addons/controllers/alias.go | 8 +- .../clusterresourceset_controller.go | 24 +- .../clusterresourceset_controller_test.go | 11 +- exp/addons/internal/controllers/suite_test.go | 30 +- exp/controllers/alias.go | 10 +- .../controllers/machinepool_controller.go | 28 +- .../machinepool_controller_noderef.go | 4 +- .../machinepool_controller_phases.go | 2 +- .../machinepool_controller_phases_test.go | 82 +- .../machinepool_controller_test.go | 18 +- exp/internal/controllers/suite_test.go | 33 +- exp/topology/desiredstate/desired_state.go | 10 +- .../controllers/cluster/cluster_controller.go | 36 +- .../cluster/cluster_controller_test.go | 23 + internal/controllers/cluster/suite_test.go | 46 +- .../controllers/machine/machine_controller.go | 46 +- .../machine/machine_controller_noderef.go | 2 +- .../machine_controller_noderef_test.go | 68 +- .../machine/machine_controller_status_test.go | 33 + .../machine/machine_controller_test.go | 56 +- internal/controllers/machine/suite_test.go | 44 +- .../machinedeployment_controller_test.go | 5 + .../machinedeployment/suite_test.go | 44 +- .../machinehealthcheck_controller.go | 25 +- .../machinehealthcheck_controller_test.go | 33 +- .../machinehealthcheck/suite_test.go | 60 +- .../machineset/machineset_controller.go | 21 +- .../machineset/machineset_controller_test.go | 5 + internal/controllers/machineset/suite_test.go | 44 +- .../topology/cluster/cluster_controller.go | 20 +- .../cluster/cluster_controller_test.go | 14 +- .../topology/cluster/suite_test.go | 46 +- internal/test/envtest/environment.go | 12 +- internal/webhooks/cluster.go | 14 +- internal/webhooks/cluster_test.go | 12 +- main.go | 140 ++-- .../docker/controllers/alias.go | 6 +- .../docker/exp/controllers/alias.go | 3 - .../dockermachinepool_controller.go | 12 +- .../controllers/dockermachine_controller.go | 14 +- test/infrastructure/docker/main.go | 98 ++- util/util.go | 5 +- webhooks/alias.go | 12 +- 65 files changed, 3900 insertions(+), 712 deletions(-) create mode 100644 controllers/clustercache/cluster_accessor.go create mode 100644 controllers/clustercache/cluster_accessor_client.go create mode 100644 controllers/clustercache/cluster_accessor_client_test.go create mode 100644 controllers/clustercache/cluster_accessor_test.go create mode 100644 
controllers/clustercache/cluster_cache.go create mode 100644 controllers/clustercache/cluster_cache_fake.go create mode 100644 controllers/clustercache/cluster_cache_test.go create mode 100644 controllers/clustercache/doc.go create mode 100644 controllers/clustercache/index.go create mode 100644 controllers/clustercache/suite_test.go diff --git a/api/v1beta1/v1beta2_condition_consts.go b/api/v1beta1/v1beta2_condition_consts.go index d97bc735f6cc..d84a1cc2a0aa 100644 --- a/api/v1beta1/v1beta2_condition_consts.go +++ b/api/v1beta1/v1beta2_condition_consts.go @@ -203,6 +203,12 @@ const ( // is detected (or whatever period is defined in the --remote-connection-grace-period flag). ClusterRemoteConnectionProbeV1Beta2Condition = "RemoteConnectionProbe" + // ClusterRemoteConnectionProbeFailedV1Beta2Reason surfaces issues with the connection to the workload cluster. + ClusterRemoteConnectionProbeFailedV1Beta2Reason = "RemoteConnectionProbeFailed" + + // ClusterRemoteConnectionProbeSucceededV1Beta2Reason is used to report a working connection with the workload cluster. + ClusterRemoteConnectionProbeSucceededV1Beta2Reason = "RemoteConnectionProbeSucceeded" + // ClusterScalingUpV1Beta2Condition is true if available replicas < desired replicas. ClusterScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition diff --git a/bootstrap/kubeadm/controllers/alias.go b/bootstrap/kubeadm/controllers/alias.go index 7c9a2aff9b0f..2fb8b5d230c8 100644 --- a/bootstrap/kubeadm/controllers/alias.go +++ b/bootstrap/kubeadm/controllers/alias.go @@ -25,7 +25,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" kubeadmbootstrapcontrollers "sigs.k8s.io/cluster-api/bootstrap/kubeadm/internal/controllers" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" ) // Following types provides access to reconcilers implemented in internal/controllers, thus @@ -41,7 +41,7 @@ type KubeadmConfigReconciler struct { Client client.Client SecretCachingClient client.Client - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -55,7 +55,7 @@ func (r *KubeadmConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl return (&kubeadmbootstrapcontrollers.KubeadmConfigReconciler{ Client: r.Client, SecretCachingClient: r.SecretCachingClient, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, TokenTTL: r.TokenTTL, }).SetupWithManager(ctx, mgr, options) diff --git a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go index dacba4fa73d9..bb05268f331a 100644 --- a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go +++ b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go @@ -50,7 +50,7 @@ import ( "sigs.k8s.io/cluster-api/bootstrap/kubeadm/internal/locking" kubeadmtypes "sigs.k8s.io/cluster-api/bootstrap/kubeadm/types" bsutil "sigs.k8s.io/cluster-api/bootstrap/util" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/util/taints" @@ -83,7 +83,7 @@ type InitLocker interface { type KubeadmConfigReconciler struct { Client client.Client SecretCachingClient client.Client - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache KubeadmInitLock InitLocker // WatchFilterValue is the label value used to filter events prior to reconciliation. @@ -135,7 +135,7 @@ func (r *KubeadmConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ), ), - ) + ).WatchesRawSource(r.ClusterCache.GetClusterSource("kubeadmconfig", r.ClusterToKubeadmConfigs)) if err := b.Complete(r); err != nil { return errors.Wrap(err, "failed setting up with a controller manager") @@ -242,10 +242,8 @@ func (r *KubeadmConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques } res, err := r.reconcile(ctx, scope, cluster, config, configOwner) - if err != nil && errors.Is(err, remote.ErrClusterLocked) { - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. 
- log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + if err != nil && errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -320,7 +318,7 @@ func (r *KubeadmConfigReconciler) refreshBootstrapTokenIfNeeded(ctx context.Cont log := ctrl.LoggerFrom(ctx) token := config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } @@ -367,7 +365,7 @@ func (r *KubeadmConfigReconciler) refreshBootstrapTokenIfNeeded(ctx context.Cont func (r *KubeadmConfigReconciler) rotateMachinePoolBootstrapToken(ctx context.Context, config *bootstrapv1.KubeadmConfig, cluster *clusterv1.Cluster, scope *Scope) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) log.V(2).Info("Config is owned by a MachinePool, checking if token should be rotated") - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } @@ -1087,7 +1085,7 @@ func (r *KubeadmConfigReconciler) reconcileDiscovery(ctx context.Context, cluste // if BootstrapToken already contains a token, respect it; otherwise create a new bootstrap token for the node to join if config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token == "" { - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } diff --git a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller_test.go b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller_test.go index 969a00ce1781..0a0480c1d5c9 100644 --- a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller_test.go +++ b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller_test.go @@ -24,7 +24,6 @@ import ( "time" ignition "github.com/flatcar/ignition/config/v2_3" - "github.com/go-logr/logr" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -34,14 +33,13 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/yaml" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" bootstrapbuilder "sigs.k8s.io/cluster-api/bootstrap/kubeadm/internal/builder" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/test/builder" @@ -509,7 +507,7 @@ func TestKubeadmConfigReconciler_Reconcile_GenerateCloudConfigData(t *testing.T) k := &KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } @@ -571,7 +569,7 @@ func TestKubeadmConfigReconciler_Reconcile_ErrorIfJoiningControlPlaneHasInvalidC k := &KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } @@ -693,7 +691,7 @@ func TestReconcileIfJoinCertificatesAvailableConditioninNodesAndControlPlaneIsRe k := &KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } @@ -770,7 +768,7 @@ func TestReconcileIfJoinNodePoolsAndControlPlaneIsReady(t *testing.T) { k := &KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } @@ -871,7 +869,7 @@ func TestBootstrapDataFormat(t *testing.T) { k := &KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } request := ctrl.Request{ @@ -952,7 +950,7 @@ func TestKubeadmConfigSecretCreatedStatusNotPatched(t *testing.T) { k := &KubeadmConfigReconciler{ Client: myclient, 
SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } request := ctrl.Request{ @@ -1033,7 +1031,7 @@ func TestBootstrapTokenTTLExtension(t *testing.T) { SecretCachingClient: myclient, KubeadmInitLock: &myInitLocker{}, TokenTTL: DefaultTokenTTL, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, remoteClient, remoteClient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), } request := ctrl.Request{ NamespacedName: client.ObjectKey{ @@ -1279,7 +1277,7 @@ func TestBootstrapTokenRotationMachinePool(t *testing.T) { SecretCachingClient: myclient, KubeadmInitLock: &myInitLocker{}, TokenTTL: DefaultTokenTTL, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, remoteClient, remoteClient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), } request := ctrl.Request{ NamespacedName: client.ObjectKey{ @@ -1602,7 +1600,7 @@ func TestKubeadmConfigReconciler_Reconcile_DiscoveryReconcileBehaviors(t *testin k := &KubeadmConfigReconciler{ Client: fakeClient, SecretCachingClient: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: tc.cluster.Name, Namespace: tc.cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: tc.cluster.Name, Namespace: tc.cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } @@ -1827,7 +1825,7 @@ func TestKubeadmConfigReconciler_Reconcile_AlwaysCheckCAVerificationUnlessReques reconciler := KubeadmConfigReconciler{ Client: myclient, SecretCachingClient: myclient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), myclient, myclient, myclient.Scheme(), client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), + ClusterCache: clustercache.NewFakeClusterCache(myclient, client.ObjectKey{Name: cluster.Name, Namespace: cluster.Namespace}), KubeadmInitLock: &myInitLocker{}, } diff --git a/bootstrap/kubeadm/main.go b/bootstrap/kubeadm/main.go index dea7668506a3..bd77fb1e5672 100644 --- a/bootstrap/kubeadm/main.go +++ b/bootstrap/kubeadm/main.go @@ -47,6 +47,7 @@ import ( bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" kubeadmbootstrapcontrollers "sigs.k8s.io/cluster-api/bootstrap/kubeadm/controllers" "sigs.k8s.io/cluster-api/bootstrap/kubeadm/internal/webhooks" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/feature" @@ -63,31 +64,31 @@ var ( controllerName = "cluster-api-kubeadm-bootstrap-manager" // flags. 
- enableLeaderElection bool - leaderElectionLeaseDuration time.Duration - leaderElectionRenewDeadline time.Duration - leaderElectionRetryPeriod time.Duration - watchFilterValue string - watchNamespace string - profilerAddress string - enableContentionProfiling bool - syncPeriod time.Duration - restConfigQPS float32 - restConfigBurst int - clusterCacheTrackerClientQPS float32 - clusterCacheTrackerClientBurst int - webhookPort int - webhookCertDir string - webhookCertName string - webhookKeyName string - healthAddr string - managerOptions = flags.ManagerOptions{} - logOptions = logs.NewOptions() + enableLeaderElection bool + leaderElectionLeaseDuration time.Duration + leaderElectionRenewDeadline time.Duration + leaderElectionRetryPeriod time.Duration + watchFilterValue string + watchNamespace string + profilerAddress string + enableContentionProfiling bool + syncPeriod time.Duration + restConfigQPS float32 + restConfigBurst int + clusterCacheClientQPS float32 + clusterCacheClientBurst int + webhookPort int + webhookCertDir string + webhookCertName string + webhookKeyName string + healthAddr string + managerOptions = flags.ManagerOptions{} + logOptions = logs.NewOptions() // CABPK specific flags. - clusterConcurrency int - clusterCacheTrackerConcurrency int - kubeadmConfigConcurrency int - tokenTTL time.Duration + clusterConcurrency int + clusterCacheConcurrency int + kubeadmConfigConcurrency int + tokenTTL time.Duration ) func init() { @@ -131,7 +132,7 @@ func InitFlags(fs *pflag.FlagSet) { "Number of clusters to process simultaneously") _ = fs.MarkDeprecated("cluster-concurrency", "This flag has no function anymore and is going to be removed in a next release. Use \"--clustercachetracker-concurrency\" instead.") - fs.IntVar(&clusterCacheTrackerConcurrency, "clustercachetracker-concurrency", 10, + fs.IntVar(&clusterCacheConcurrency, "clustercache-concurrency", 100, "Number of clusters to process simultaneously") fs.IntVar(&kubeadmConfigConcurrency, "kubeadmconfig-concurrency", 10, @@ -146,11 +147,11 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&restConfigBurst, "kube-api-burst", 30, "Maximum number of queries that should be allowed in one burst from the controller client to the Kubernetes API server.") - fs.Float32Var(&clusterCacheTrackerClientQPS, "clustercachetracker-client-qps", 20, - "Maximum queries per second from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.Float32Var(&clusterCacheClientQPS, "clustercache-client-qps", 20, + "Maximum queries per second from the cluster cache clients to the Kubernetes API server of workload clusters.") - fs.IntVar(&clusterCacheTrackerClientBurst, "clustercachetracker-client-burst", 30, - "Maximum number of queries that should be allowed in one burst from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.IntVar(&clusterCacheClientBurst, "clustercache-client-burst", 30, + "Maximum number of queries that should be allowed in one burst from the cluster cache clients to the Kubernetes API server of workload clusters.") fs.DurationVar(&tokenTTL, "bootstrap-token-ttl", kubeadmbootstrapcontrollers.DefaultTokenTTL, "The amount of time the bootstrap token will be valid") @@ -312,35 +313,32 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { os.Exit(1) } - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - 
remote.ClusterCacheTrackerOptions{ - SecretCachingClient: secretCachingClient, - ControllerName: controllerName, - Log: &ctrl.Log, - ClientQPS: clusterCacheTrackerClientQPS, - ClientBurst: clusterCacheTrackerClientBurst, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: secretCachingClient, + Cache: clustercache.CacheOptions{}, + Client: clustercache.ClientOptions{ + QPS: clusterCacheClientQPS, + Burst: clusterCacheClientBurst, + UserAgent: remote.DefaultClusterAPIUserAgent(controllerName), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, }, - ) - if err != nil { - setupLog.Error(err, "unable to create cluster cache tracker") - os.Exit(1) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, WatchFilterValue: watchFilterValue, - }).SetupWithManager(ctx, mgr, concurrency(clusterCacheTrackerConcurrency)); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "ClusterCacheReconciler") + }, concurrency(clusterCacheConcurrency)) + if err != nil { + setupLog.Error(err, "Unable to create ClusterCache") os.Exit(1) } if err := (&kubeadmbootstrapcontrollers.KubeadmConfigReconciler{ Client: mgr.GetClient(), SecretCachingClient: secretCachingClient, - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, TokenTTL: tokenTTL, }).SetupWithManager(ctx, mgr, concurrency(kubeadmConfigConcurrency)); err != nil { diff --git a/controllers/alias.go b/controllers/alias.go index 76e300827c3c..90e5c201aa47 100644 --- a/controllers/alias.go +++ b/controllers/alias.go @@ -18,12 +18,13 @@ package controllers import ( "context" + "time" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" clustercontroller "sigs.k8s.io/cluster-api/internal/controllers/cluster" clusterclasscontroller "sigs.k8s.io/cluster-api/internal/controllers/clusterclass" machinecontroller "sigs.k8s.io/cluster-api/internal/controllers/machine" @@ -41,26 +42,31 @@ import ( // ClusterReconciler reconciles a Cluster object. type ClusterReconciler struct { - Client client.Client - APIReader client.Reader + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string + + RemoteConnectionGracePeriod time.Duration } func (r *ClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { return (&clustercontroller.Reconciler{ - Client: r.Client, - APIReader: r.APIReader, - WatchFilterValue: r.WatchFilterValue, + Client: r.Client, + APIReader: r.APIReader, + ClusterCache: r.ClusterCache, + WatchFilterValue: r.WatchFilterValue, + RemoteConnectionGracePeriod: r.RemoteConnectionGracePeriod, }).SetupWithManager(ctx, mgr, options) } // MachineReconciler reconciles a Machine object. type MachineReconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -70,16 +76,16 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag return (&machinecontroller.Reconciler{ Client: r.Client, APIReader: r.APIReader, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } // MachineSetReconciler reconciles a MachineSet object. type MachineSetReconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -92,7 +98,7 @@ func (r *MachineSetReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Ma return (&machinesetcontroller.Reconciler{ Client: r.Client, APIReader: r.APIReader, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, DeprecatedInfraMachineNaming: r.DeprecatedInfraMachineNaming, }).SetupWithManager(ctx, mgr, options) @@ -117,8 +123,8 @@ func (r *MachineDeploymentReconciler) SetupWithManager(ctx context.Context, mgr // MachineHealthCheckReconciler reconciles a MachineHealthCheck object. type MachineHealthCheckReconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -127,15 +133,15 @@ type MachineHealthCheckReconciler struct { func (r *MachineHealthCheckReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { return (&machinehealthcheckcontroller.Reconciler{ Client: r.Client, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } // ClusterTopologyReconciler reconciles a managed topology for a Cluster object. type ClusterTopologyReconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // APIReader is used to list MachineSets directly via the API server to avoid // race conditions caused by an outdated cache. APIReader client.Reader @@ -150,7 +156,7 @@ func (r *ClusterTopologyReconciler) SetupWithManager(ctx context.Context, mgr ct return (&clustertopologycontroller.Reconciler{ Client: r.Client, APIReader: r.APIReader, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, RuntimeClient: r.RuntimeClient, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) diff --git a/controllers/clustercache/cluster_accessor.go b/controllers/clustercache/cluster_accessor.go new file mode 100644 index 000000000000..1e304d7c28be --- /dev/null +++ b/controllers/clustercache/cluster_accessor.go @@ -0,0 +1,497 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package clustercache + +import ( + "context" + "crypto/rsa" + "fmt" + "sync" + "time" + + "github.com/pkg/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/source" + + "sigs.k8s.io/cluster-api/util/certs" +) + +// clusterAccessor is the object used to create and manage connections to a specific workload cluster. +type clusterAccessor struct { + cluster client.ObjectKey + + // config is the config of the clusterAccessor. + config *clusterAccessorConfig + + // lockedStateLock is used to synchronize access to lockedState. + // It should *never* be held for an extended period of time (e.g. during connection creation) to ensure + // regular controllers are not blocked when calling e.g. GetClient(). + // lockedStateLock should never be used directly, use the following methods instead: lock, unlock, rLock, rUnlock. + // All usages of these functions can be found with the following regexp: "\.lock\(|\.unlock\(|\.rLock\(|\.rUnlock\(" + lockedStateLock sync.RWMutex + + // lockedState is the state of the clusterAccessor. This includes the connection (e.g. client, cache) + // and health checking information (e.g. lastProbeSuccessTimestamp, consecutiveFailures). + // lockedStateLock must be *always* held (via lock or rLock) before accessing this field. + lockedState clusterAccessorLockedState +} + +// clusterAccessorConfig is the config of the clusterAccessor. +type clusterAccessorConfig struct { + // Scheme is the scheme used for the client and the cache. + Scheme *runtime.Scheme + + // SecretClient is the client used to access secrets of type secret.Kubeconfig, i.e. Secrets + // with the following name format: "-kubeconfig". + // Ideally this is a client that caches only kubeconfig secrets, it is highly recommended to avoid caching all secrets. + // An example on how to create an ideal secret caching client can be found in the core Cluster API controller main.go file. + SecretClient client.Reader + + // ControllerPodMetadata is the Pod metadata of the controller using this ClusterCache. + // This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set. + // This information will be used to detected if the controller is running on a workload cluster, so + // that for the Cluster we're running on we can then access the apiserver directly instead of going + // through the apiserver loadbalancer. + ControllerPodMetadata *metav1.ObjectMeta + + // ConnectionCreationRetryInterval is the interval after which to retry to create a + // connection after creating a connection failed. + ConnectionCreationRetryInterval time.Duration + + // Cache is the config used for the cache that the clusterAccessor creates. + Cache *clusterAccessorCacheConfig + + // Client is the config used for the client that the clusterAccessor creates. + Client *clusterAccessorClientConfig + + // HealthProbe is the configuration for the health probe. + HealthProbe *clusterAccessorHealthProbeConfig +} + +// clusterAccessorCacheConfig is the config used for the cache that the clusterAccessor creates. +type clusterAccessorCacheConfig struct { + // InitialSyncTimeout is the timeout used when waiting for the cache to sync after cache start. 
+ InitialSyncTimeout time.Duration + + // SyncPeriod is the sync period of the cache. + SyncPeriod *time.Duration + + // ByObject restricts the cache's ListWatch to the desired fields per GVK at the specified object. + ByObject map[client.Object]cache.ByObject + + // Indexes are the indexes added to the cache. + Indexes []CacheOptionsIndex +} + +// clusterAccessorClientConfig is the config used for the client that the clusterAccessor creates. +type clusterAccessorClientConfig struct { + // Timeout is the timeout used for the rest.Config. + // The rest.Config is also used to create the client and the cache. + Timeout time.Duration + + // QPS is the qps used for the rest.Config. + // The rest.Config is also used to create the client and the cache. + QPS float32 + + // Burst is the burst used for the rest.Config. + // The rest.Config is also used to create the client and the cache. + Burst int + + // UserAgent is the user agent used for the rest.Config. + // The rest.Config is also used to create the client and the cache. + UserAgent string + + // Cache is the cache config defining how the clients that clusterAccessor creates + // should interact with the underlying cache. + Cache clusterAccessorClientCacheConfig +} + +// clusterAccessorClientCacheConfig is the cache config used for the client that the clusterAccessor creates. +type clusterAccessorClientCacheConfig struct { + // DisableFor is a list of objects that should never be read from the cache. + // Get & List calls for objects configured here always result in a live lookup. + DisableFor []client.Object +} + +// clusterAccessorHealthProbeConfig is the configuration for the health probe. +type clusterAccessorHealthProbeConfig struct { + // Timeout is the timeout after which the health probe times out. + Timeout time.Duration + + // Interval is the interval in which the probe should be run. + Interval time.Duration + + // FailureThreshold is the number of consecutive failures after which + // the health probe is considered failed. + FailureThreshold int +} + +// clusterAccessorLockedState is the state of the clusterAccessor. This includes the connection (e.g. client, cache) +// and health checking information (e.g. lastProbeSuccessTimestamp, consecutiveFailures). +// lockedStateLock must be *always* held (via lock or rLock) before accessing this field. +type clusterAccessorLockedState struct { + // lastConnectionCreationErrorTimestamp is the timestamp when connection creation failed the last time. + lastConnectionCreationErrorTimestamp time.Time + + // connection holds the connection state (e.g. client, cache) of the clusterAccessor. + connection *clusterAccessorLockedConnectionState + + // clientCertificatePrivateKey is a private key that is generated once for a clusterAccessor + // and can then be used to generate client certificates. This is e.g. used in KCP to generate a client + // cert to communicate with etcd. + // This private key is stored and cached in the ClusterCache because it's expensive to generate a new + // private key in every single Reconcile. + clientCertificatePrivateKey *rsa.PrivateKey + + // healthChecking holds the health checking state (e.g. lastProbeSuccessTimestamp, consecutiveFailures) + // of the clusterAccessor. + healthChecking clusterAccessorLockedHealthCheckingState +} + +// RESTClient is an interface only containing the methods of *rest.RESTClient that we use. +// Using this interface instead of *rest.RESTClient makes it possible to use the fake.RESTClient in unit tests. 
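+// At runtime this interface is satisfied by the *rest.RESTClient created in createHTTPClientAndMapper;
+// the health probe uses it to issue "GET /" requests against the workload cluster apiserver.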
+type RESTClient interface { + Get() *rest.Request +} + +// clusterAccessorLockedConnectionState holds the connection state (e.g. client, cache) of the clusterAccessor. +type clusterAccessorLockedConnectionState struct { + // restConfig to communicate with the workload cluster. + restConfig *rest.Config + + // restClient to communicate with the workload cluster. + restClient RESTClient + + // cachedClient to communicate with the workload cluster. + // It uses cache for Get & List calls for all Unstructured objects and + // all typed objects except the ones for which caching has been disabled via DisableFor. + cachedClient client.Client + + // cache is the cache used by the client. + // It manages informers that have been created e.g. by adding indexes to the cache, + // Get & List calls from the client or via the Watch method of the clusterAccessor. + cache *stoppableCache + + // watches is used to track the watches that have been added through the Watch method + // of the clusterAccessor. This is important to avoid adding duplicate watches. + watches sets.Set[string] +} + +// clusterAccessorLockedHealthCheckingState holds the health checking state (e.g. lastProbeSuccessTimestamp, +// consecutiveFailures) of the clusterAccessor. +type clusterAccessorLockedHealthCheckingState struct { + // lastProbeTimestamp is the time when the health probe was executed last. + lastProbeTimestamp time.Time + + // lastProbeSuccessTimestamp is the time when the health probe was successfully executed last. + lastProbeSuccessTimestamp time.Time + + // consecutiveFailures is the number of consecutive health probe failures. + consecutiveFailures int +} + +// newClusterAccessor creates a new clusterAccessor. +func newClusterAccessor(cluster client.ObjectKey, clusterAccessorConfig *clusterAccessorConfig) *clusterAccessor { + return &clusterAccessor{ + cluster: cluster, + config: clusterAccessorConfig, + } +} + +// Connected returns true if there is a connection to the workload cluster, i.e. the clusterAccessor has a +// client, cache, etc. +func (ca *clusterAccessor) Connected(ctx context.Context) bool { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + return ca.lockedState.connection != nil +} + +// Connect creates a connection to the workload cluster, i.e. it creates a client, cache, etc. +// +// This method will only be called by the ClusterCache reconciler and not by other regular reconcilers. +// Controller-runtime guarantees that the ClusterCache reconciler will never reconcile the same Cluster concurrently. +// Thus, we can rely on this method not being called concurrently for the same Cluster / clusterAccessor. +// This means we don't have to hold the lock when we create the client, cache, etc. +// It is extremely important that we don't lock during connection creation, so that other methods like GetClient (that +// use a read lock) can return immediately even when we currently create a client. This is necessary to avoid blocking +// other reconcilers (e.g. the Cluster or Machine reconciler). +func (ca *clusterAccessor) Connect(ctx context.Context) (retErr error) { + log := ctrl.LoggerFrom(ctx) + + if ca.Connected(ctx) { + log.V(6).Info("Skipping connect, already connected") + return nil + } + + log.Info("Connecting") + + // Creating clients, cache etc. is intentionally done without a lock to avoid blocking other reconcilers. 
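+	// Note: createConnection can take a while, e.g. when the workload cluster apiserver is not reachable
+	// (it runs an initial "GET /" call and waits for the initial cache sync, both of which have timeouts).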
+ connection, err := ca.createConnection(ctx) + + ca.lock(ctx) + defer ca.unlock(ctx) + + defer func() { + if retErr != nil { + log.Error(retErr, "Connect failed") + ca.lockedState.lastConnectionCreationErrorTimestamp = time.Now() + } + }() + + if err != nil { + return err + } + + log.Info("Connected") + + // Only generate the clientCertificatePrivateKey once as there is no need to regenerate it after disconnect/connect. + // Note: This has to be done before setting connection, because otherwise this code wouldn't be re-entrant if the + // private key generation fails because we check Connected above. + if ca.lockedState.clientCertificatePrivateKey == nil { + log.V(6).Info("Generating client certificate private key") + clientCertificatePrivateKey, err := certs.NewPrivateKey() + if err != nil { + return errors.Wrapf(err, "error creating client certificate private key") + } + ca.lockedState.clientCertificatePrivateKey = clientCertificatePrivateKey + } + + now := time.Now() + ca.lockedState.healthChecking = clusterAccessorLockedHealthCheckingState{ + // A client was just created successfully, so let's set the last probe times. + lastProbeTimestamp: now, + lastProbeSuccessTimestamp: now, + consecutiveFailures: 0, + } + ca.lockedState.connection = &clusterAccessorLockedConnectionState{ + restConfig: connection.RESTConfig, + restClient: connection.RESTClient, + cachedClient: connection.CachedClient, + cache: connection.Cache, + watches: sets.Set[string]{}, + } + + return nil +} + +// Disconnect disconnects a connection to the workload cluster. +func (ca *clusterAccessor) Disconnect(ctx context.Context) { + log := ctrl.LoggerFrom(ctx) + + if !ca.Connected(ctx) { + log.V(6).Info("Skipping disconnect, already disconnected") + return + } + + ca.lock(ctx) + defer ca.unlock(ctx) + + log.Info("Disconnecting") + + // Stopping the cache is non-blocking, so it's okay to do it while holding the lock. + // Note: Stopping the cache will also trigger shutdown of all informers that have been added to the cache. + log.V(6).Info("Stopping cache") + ca.lockedState.connection.cache.Stop() + + log.Info("Disconnected") + + ca.lockedState.connection = nil +} + +// HealthCheck will run a health probe against the cluster's apiserver (a "GET /" call). +func (ca *clusterAccessor) HealthCheck(ctx context.Context) (bool, bool) { + log := ctrl.LoggerFrom(ctx) + + if !ca.Connected(ctx) { + log.V(6).Info("Skipping health check, not connected") + return false, false + } + + ca.rLock(ctx) + restClient := ca.lockedState.connection.restClient + ca.rUnlock(ctx) + + log.V(6).Info("Run health probe") + + // Executing the health probe is intentionally done without a lock to avoid blocking other reconcilers. + _, err := restClient.Get().AbsPath("/").Timeout(ca.config.HealthProbe.Timeout).DoRaw(ctx) + + ca.lock(ctx) + defer ca.unlock(ctx) + + ca.lockedState.healthChecking.lastProbeTimestamp = time.Now() + + unauthorizedErrorOccurred := false + switch { + case err != nil && apierrors.IsUnauthorized(err): + // Unauthorized means that the underlying kubeconfig is not authorizing properly anymore, which + // usually is the result of automatic kubeconfig refreshes, meaning that we have to disconnect the + // clusterAccessor and re-connect without waiting for further failed health probes. 
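+		// The unauthorizedErrorOccurred return value surfaces this condition to the caller, so it can
+		// disconnect and re-connect without waiting for the failure threshold to be reached.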
+ unauthorizedErrorOccurred = true + ca.lockedState.healthChecking.consecutiveFailures++ + log.V(6).Info(fmt.Sprintf("Health probe failed (unauthorized error occurred): %v", err)) + case err != nil: + ca.lockedState.healthChecking.consecutiveFailures++ + log.V(6).Info(fmt.Sprintf("Health probe failed (%d/%d): %v", + ca.lockedState.healthChecking.consecutiveFailures, ca.config.HealthProbe.FailureThreshold, err)) + default: + ca.lockedState.healthChecking.consecutiveFailures = 0 + ca.lockedState.healthChecking.lastProbeSuccessTimestamp = ca.lockedState.healthChecking.lastProbeTimestamp + log.V(6).Info("Health probe succeeded") + } + + tooManyConsecutiveFailures := ca.lockedState.healthChecking.consecutiveFailures >= ca.config.HealthProbe.FailureThreshold + return tooManyConsecutiveFailures, unauthorizedErrorOccurred +} + +func (ca *clusterAccessor) GetClient(ctx context.Context) (client.Client, error) { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + if ca.lockedState.connection == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting client") + } + + return ca.lockedState.connection.cachedClient, nil +} + +func (ca *clusterAccessor) GetReader(ctx context.Context) (client.Reader, error) { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + if ca.lockedState.connection == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting client reader") + } + + return ca.lockedState.connection.cachedClient, nil +} + +func (ca *clusterAccessor) GetRESTConfig(ctx context.Context) (*rest.Config, error) { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + if ca.lockedState.connection == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting REST config") + } + + return ca.lockedState.connection.restConfig, nil +} + +func (ca *clusterAccessor) GetClientCertificatePrivateKey(ctx context.Context) *rsa.PrivateKey { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + return ca.lockedState.clientCertificatePrivateKey +} + +// Watch watches a workload cluster for events. +// Each unique watch (by input.Name) is only added once after a Connect (otherwise we return early). +// During a disconnect existing watches (i.e. informers) are shutdown when stopping the cache. +// After a re-connect watches will be re-added (assuming the Watch method is called again). +func (ca *clusterAccessor) Watch(ctx context.Context, input WatchInput) error { + if input.Name == "" { + return errors.New("input.Name is required") + } + + if !ca.Connected(ctx) { + return errors.Wrapf(ErrClusterNotConnected, "error creating watch %s for %T", input.Name, input.Kind) + } + + log := ctrl.LoggerFrom(ctx) + + // Calling Watch on a controller is non-blocking because it only calls Start on the Kind source. + // Start on the Kind source starts an additional go routine to create an informer. + // Because it is non-blocking we can use a full lock here without blocking other reconcilers too long. + ca.lock(ctx) + defer ca.unlock(ctx) + + // Checking connection again while holding the lock, because maybe Disconnect was called since checking above. + if ca.lockedState.connection == nil { + return errors.Wrapf(ErrClusterNotConnected, "error creating watch %s for %T", input.Name, input.Kind) + } + + // Return early if the watch was already added. 
+ if ca.lockedState.connection.watches.Has(input.Name) { + log.V(6).Info(fmt.Sprintf("Skip creation of watch %s for %T because it already exists", input.Name, input.Kind)) + return nil + } + + log.Info(fmt.Sprintf("Creating watch %s for %T", input.Name, input.Kind)) + if err := input.Watcher.Watch(source.Kind(ca.lockedState.connection.cache, input.Kind, input.EventHandler, input.Predicates...)); err != nil { + return errors.Wrapf(err, "error creating watch %s for %T", input.Name, input.Kind) + } + + ca.lockedState.connection.watches.Insert(input.Name) + return nil +} + +func (ca *clusterAccessor) GetLastProbeSuccessTimestamp(ctx context.Context) time.Time { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + return ca.lockedState.healthChecking.lastProbeSuccessTimestamp +} + +func (ca *clusterAccessor) GetLastProbeTimestamp(ctx context.Context) time.Time { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + return ca.lockedState.healthChecking.lastProbeTimestamp +} + +func (ca *clusterAccessor) GetLastConnectionCreationErrorTimestamp(ctx context.Context) time.Time { + ca.rLock(ctx) + defer ca.rUnlock(ctx) + + return ca.lockedState.lastConnectionCreationErrorTimestamp +} + +func (ca *clusterAccessor) rLock(ctx context.Context) { + log := ctrl.LoggerFrom(ctx).WithCallDepth(1) + log.V(10).Info("Getting read lock for ClusterAccessor") + ca.lockedStateLock.RLock() + log.V(10).Info("Got read lock for ClusterAccessor") +} + +func (ca *clusterAccessor) rUnlock(ctx context.Context) { + log := ctrl.LoggerFrom(ctx).WithCallDepth(1) + log.V(10).Info("Removing read lock for ClusterAccessor") + ca.lockedStateLock.RUnlock() + log.V(10).Info("Removed read lock for ClusterAccessor") +} + +func (ca *clusterAccessor) lock(ctx context.Context) { + log := ctrl.LoggerFrom(ctx).WithCallDepth(1) + log.V(10).Info("Getting lock for ClusterAccessor") + ca.lockedStateLock.Lock() + log.V(10).Info("Got lock for ClusterAccessor") +} + +func (ca *clusterAccessor) unlock(ctx context.Context) { + log := ctrl.LoggerFrom(ctx).WithCallDepth(1) + log.V(10).Info("Removing lock for ClusterAccessor") + ca.lockedStateLock.Unlock() + log.V(10).Info("Removed lock for ClusterAccessor") +} diff --git a/controllers/clustercache/cluster_accessor_client.go b/controllers/clustercache/cluster_accessor_client.go new file mode 100644 index 000000000000..b0632e6f5581 --- /dev/null +++ b/controllers/clustercache/cluster_accessor_client.go @@ -0,0 +1,329 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package clustercache + +import ( + "context" + "fmt" + "net/http" + "sync" + "time" + + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + + kcfg "sigs.k8s.io/cluster-api/util/kubeconfig" +) + +type createConnectionResult struct { + RESTConfig *rest.Config + RESTClient *rest.RESTClient + CachedClient client.Client + Cache *stoppableCache +} + +func (ca *clusterAccessor) createConnection(ctx context.Context) (*createConnectionResult, error) { + log := ctrl.LoggerFrom(ctx) + log.V(6).Info("Creating connection") + + log.V(6).Info("Creating REST config") + restConfig, err := createRESTConfig(ctx, ca.config.Client, ca.config.SecretClient, ca.cluster) + if err != nil { + return nil, err + } + + log.V(6).Info("Creating HTTP client and mapper") + httpClient, mapper, restClient, err := createHTTPClientAndMapper(ctx, ca.config.HealthProbe, restConfig) + if err != nil { + return nil, errors.Wrapf(err, "error creating HTTP client and mapper") + } + + log.V(6).Info("Creating uncached client") + uncachedClient, err := createUncachedClient(ca.config.Scheme, restConfig, httpClient, mapper) + if err != nil { + return nil, err + } + + log.V(6).Info("Detect if controller is running on the cluster") + // This function uses an uncached client to ensure pods aren't cached by the long-lived client. + runningOnCluster, err := runningOnWorkloadCluster(ctx, ca.config.ControllerPodMetadata, uncachedClient) + if err != nil { + return nil, err + } + + // If the controller runs on the workload cluster, access the apiserver directly by using the + // CA and Host from the in-cluster configuration. + if runningOnCluster { + log.V(6).Info("Controller is running on the cluster, updating REST config with in-cluster config") + + inClusterConfig, err := ctrl.GetConfig() + if err != nil { + return nil, errors.Wrapf(err, "error getting in-cluster REST config") + } + + // Use CA and Host from in-cluster config. + restConfig.CAData = nil + restConfig.CAFile = inClusterConfig.CAFile + restConfig.Host = inClusterConfig.Host + + log.V(6).Info(fmt.Sprintf("Creating HTTP client and mapper with updated REST config with host %q", restConfig.Host)) + httpClient, mapper, restClient, err = createHTTPClientAndMapper(ctx, ca.config.HealthProbe, restConfig) + if err != nil { + return nil, errors.Wrapf(err, "error creating HTTP client and mapper (using in-cluster config)") + } + } + + log.V(6).Info("Creating cached client and cache") + cachedClient, cache, err := createCachedClient(ctx, ca.config, restConfig, httpClient, mapper) + if err != nil { + return nil, err + } + + return &createConnectionResult{ + RESTConfig: restConfig, + RESTClient: restClient, + CachedClient: cachedClient, + Cache: cache, + }, nil +} + +// createRESTConfig returns a REST config created based on the kubeconfig Secret. 
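+// By convention this is the Secret named "<cluster-name>-kubeconfig" in the Cluster's namespace (read via kcfg.FromSecret).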
+func createRESTConfig(ctx context.Context, clientConfig *clusterAccessorClientConfig, c client.Reader, cluster client.ObjectKey) (*rest.Config, error) { + kubeConfig, err := kcfg.FromSecret(ctx, c, cluster) + if err != nil { + return nil, errors.Wrapf(err, "error creating REST config: error getting kubeconfig secret") + } + + restConfig, err := clientcmd.RESTConfigFromKubeConfig(kubeConfig) + if err != nil { + return nil, errors.Wrapf(err, "error creating REST config: error parsing kubeconfig") + } + + restConfig.UserAgent = clientConfig.UserAgent + restConfig.Timeout = clientConfig.Timeout + restConfig.QPS = clientConfig.QPS + restConfig.Burst = clientConfig.Burst + + return restConfig, nil +} + +// runningOnWorkloadCluster detects if the current controller runs on the workload cluster. +func runningOnWorkloadCluster(ctx context.Context, controllerPodMetadata *metav1.ObjectMeta, c client.Client) (bool, error) { + // Controller Pod metadata was not found, so we can't detect if we run on the workload cluster. + if controllerPodMetadata == nil { + return false, nil + } + + // Try to get the controller pod. + var pod corev1.Pod + if err := c.Get(ctx, client.ObjectKey{ + Namespace: controllerPodMetadata.Namespace, + Name: controllerPodMetadata.Name, + }, &pod); err != nil { + // If the controller Pod is not found, we assume we are not running on the workload cluster. + if apierrors.IsNotFound(err) { + return false, nil + } + + // If we got another error, we return the error so that this will be retried later. + return false, errors.Wrapf(err, "error checking if we're running on workload cluster") + } + + // If the uid is the same we found the controller pod on the workload cluster. + return controllerPodMetadata.UID == pod.UID, nil +} + +// createHTTPClientAndMapper creates a http client and a dynamic REST mapper for the given cluster, based on the rest.Config. +func createHTTPClientAndMapper(ctx context.Context, healthProbeConfig *clusterAccessorHealthProbeConfig, config *rest.Config) (*http.Client, meta.RESTMapper, *rest.RESTClient, error) { + // Create a http client for the cluster. + httpClient, err := rest.HTTPClientFor(config) + if err != nil { + return nil, nil, nil, errors.Wrapf(err, "error creating HTTP client") + } + + // Create a dynamic REST mapper for the cluster. + mapper, err := apiutil.NewDynamicRESTMapper(config, httpClient) + if err != nil { + return nil, nil, nil, errors.Wrapf(err, "error creating dynamic REST mapper") + } + + // Create a REST client for the cluster (this is later used for health checking as well). + codec := runtime.NoopEncoder{Decoder: scheme.Codecs.UniversalDecoder()} + restClientConfig := rest.CopyConfig(config) + restClientConfig.NegotiatedSerializer = serializer.NegotiatedSerializerWrapper(runtime.SerializerInfo{Serializer: codec}) + restClient, err := rest.UnversionedRESTClientForConfigAndClient(restClientConfig, httpClient) + if err != nil { + return nil, nil, nil, errors.Wrapf(err, "error creating REST client") + } + + // Note: This checks if the apiserver is up. We do this already here to produce a clearer error message if the cluster is unreachable. + if _, err := restClient.Get().AbsPath("/").Timeout(healthProbeConfig.Timeout).DoRaw(ctx); err != nil { + return nil, nil, nil, errors.Wrapf(err, "cluster is not reachable") + } + + // Verify if we can get a REST mapping from the workload cluster apiserver. 
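+	// Note: Node is used here simply as a resource that is expected to exist in every Kubernetes cluster.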
+ _, err = mapper.RESTMapping(corev1.SchemeGroupVersion.WithKind("Node").GroupKind(), corev1.SchemeGroupVersion.Version) + if err != nil { + return nil, nil, nil, errors.Wrapf(err, "error getting REST mapping") + } + + return httpClient, mapper, restClient, nil +} + +// createUncachedClient creates an uncached client for the given cluster, based on the rest.Config. +func createUncachedClient(scheme *runtime.Scheme, config *rest.Config, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, error) { + // Create the uncached client for the cluster. + uncachedClient, err := client.New(config, client.Options{ + Scheme: scheme, + Mapper: mapper, + HTTPClient: httpClient, + }) + if err != nil { + return nil, errors.Wrapf(err, "error creating uncached client") + } + + return uncachedClient, nil +} + +// createCachedClient creates a cached client for the given cluster, based on the rest.Config. +func createCachedClient(ctx context.Context, clusterAccessorConfig *clusterAccessorConfig, config *rest.Config, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, *stoppableCache, error) { + // Create the cache for the cluster. + cacheOptions := cache.Options{ + HTTPClient: httpClient, + Scheme: clusterAccessorConfig.Scheme, + Mapper: mapper, + SyncPeriod: clusterAccessorConfig.Cache.SyncPeriod, + ByObject: clusterAccessorConfig.Cache.ByObject, + } + remoteCache, err := cache.New(config, cacheOptions) + if err != nil { + return nil, nil, errors.Wrapf(err, "error creating cache") + } + + cacheCtx, cacheCtxCancel := context.WithCancel(ctx) + + // We need to be able to stop the cache's shared informers, so wrap this in a stoppableCache. + cache := &stoppableCache{ + Cache: remoteCache, + cancelFunc: cacheCtxCancel, + } + + for _, index := range clusterAccessorConfig.Cache.Indexes { + if err := cache.IndexField(ctx, index.Object, index.Field, index.ExtractValue); err != nil { + return nil, nil, errors.Wrapf(err, "error adding index for field %q to cache", index.Field) + } + } + + // Create the client for the cluster. + cachedClient, err := client.New(config, client.Options{ + Scheme: clusterAccessorConfig.Scheme, + Mapper: mapper, + HTTPClient: httpClient, + Cache: &client.CacheOptions{ + Reader: cache, + DisableFor: clusterAccessorConfig.Client.Cache.DisableFor, + Unstructured: true, + }, + }) + if err != nil { + return nil, nil, errors.Wrapf(err, "error creating cached client") + } + + // Start the cache! + go cache.Start(cacheCtx) //nolint:errcheck + + // Wait until the cache is initially synced. + cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, clusterAccessorConfig.Cache.InitialSyncTimeout) + defer cacheSyncCtxCancel() + if !cache.WaitForCacheSync(cacheSyncCtx) { + cache.Stop() + return nil, nil, errors.Wrapf(cacheCtx.Err(), "error when waiting for cache to sync") + } + + // Wrap the cached client with a client that sets timeouts on all Get and List calls + // If we don't set timeouts here Get and List calls can get stuck if they lazily create a new informer + // and the informer than doesn't sync because the workload cluster apiserver is not reachable. + // An alternative would be to set timeouts in the contexts we pass into all Get and List calls. + // It should be reasonable to have Get and List calls timeout within the duration configured in the restConfig. 
+ cachedClient = newClientWithTimeout(cachedClient, config.Timeout)
+
+ return cachedClient, cache, nil
+}
+
+// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
+// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
+// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
+// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
+func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
+ return clientWithTimeout{
+ Client: client,
+ timeout: timeout,
+ }
+}
+
+type clientWithTimeout struct {
+ client.Client
+ timeout time.Duration
+}
+
+var _ client.Client = &clientWithTimeout{}
+
+func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+ ctx, cancel := context.WithTimeout(ctx, c.timeout)
+ defer cancel()
+ return c.Client.Get(ctx, key, obj, opts...)
+}
+
+func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
+ ctx, cancel := context.WithTimeout(ctx, c.timeout)
+ defer cancel()
+ return c.Client.List(ctx, list, opts...)
+}
+
+// stoppableCache embeds cache.Cache and combines it with a stop channel.
+type stoppableCache struct {
+ cache.Cache
+
+ lock sync.Mutex
+ stopped bool
+ cancelFunc context.CancelFunc
+}
+
+// Stop cancels the cache.Cache's context, unless it has already been stopped.
+func (cc *stoppableCache) Stop() {
+ cc.lock.Lock()
+ defer cc.lock.Unlock()
+
+ if cc.stopped {
+ return
+ }
+
+ cc.stopped = true
+ cc.cancelFunc()
+}
diff --git a/controllers/clustercache/cluster_accessor_client_test.go b/controllers/clustercache/cluster_accessor_client_test.go
new file mode 100644
index 000000000000..aeee4e4e5df9
--- /dev/null
+++ b/controllers/clustercache/cluster_accessor_client_test.go
@@ -0,0 +1,146 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package clustercache
+
+import (
+ "testing"
+
+ . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestRunningOnWorkloadCluster(t *testing.T) { + tests := []struct { + name string + currentControllerMetadata *metav1.ObjectMeta + clusterObjects []client.Object + expected bool + }{ + { + name: "should return true if the controller is running on the workload cluster", + currentControllerMetadata: &metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + clusterObjects: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + }, + }, + expected: true, + }, + { + name: "should return false if the controller is not running on the workload cluster: name mismatch", + currentControllerMetadata: &metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + clusterObjects: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-pod-mismatch", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + }, + }, + expected: false, + }, + { + name: "should return false if the controller is not running on the workload cluster: namespace mismatch", + currentControllerMetadata: &metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + clusterObjects: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace-mismatch", + UID: types.UID("controller-pod-uid"), + }, + }, + }, + expected: false, + }, + { + name: "should return false if the controller is not running on the workload cluster: uid mismatch", + currentControllerMetadata: &metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + clusterObjects: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid-mismatch"), + }, + }, + }, + expected: false, + }, + { + name: "should return false if the controller is not running on the workload cluster: no pod in cluster", + currentControllerMetadata: &metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + clusterObjects: []client.Object{}, + expected: false, + }, + { + name: "should return false if the controller is not running on the workload cluster: no controller metadata", + currentControllerMetadata: nil, + clusterObjects: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "controller-pod", + Namespace: "controller-pod-namespace", + UID: types.UID("controller-pod-uid"), + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + c := fake.NewClientBuilder().WithObjects(tt.clusterObjects...).Build() + + found, err := runningOnWorkloadCluster(ctx, tt.currentControllerMetadata, c) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(found).To(Equal(tt.expected)) + }) + } +} diff --git a/controllers/clustercache/cluster_accessor_test.go 
b/controllers/clustercache/cluster_accessor_test.go new file mode 100644 index 000000000000..0a8cef6d9754 --- /dev/null +++ b/controllers/clustercache/cluster_accessor_test.go @@ -0,0 +1,382 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clustercache + +import ( + "bytes" + "context" + "errors" + "io" + "net/http" + "testing" + "time" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest/fake" + "k8s.io/client-go/tools/clientcmd" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/source" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/util/kubeconfig" +) + +func TestConnect(t *testing.T) { + g := NewWithT(t) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + clusterKey := client.ObjectKeyFromObject(testCluster) + g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, testCluster)).To(Succeed()) }() + + config := buildClusterAccessorConfig(env.Manager.GetScheme(), Options{ + SecretClient: env.Manager.GetClient(), + Client: ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Timeout: 10 * time.Second, + }, + Cache: CacheOptions{ + Indexes: []CacheOptionsIndex{NodeProviderIDIndex}, + }, + }, nil) + accessor := newClusterAccessor(clusterKey, config) + + // Connect when kubeconfig Secret doesn't exist (should fail) + err := accessor.Connect(ctx) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(Equal("error creating REST config: error getting kubeconfig secret: Secret \"test-cluster-kubeconfig\" not found")) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) + g.Expect(accessor.lockedState.lastConnectionCreationErrorTimestamp.IsZero()).To(BeFalse()) + accessor.lockedState.lastConnectionCreationErrorTimestamp = time.Time{} // so we can compare in the next line + g.Expect(accessor.lockedState).To(Equal(clusterAccessorLockedState{})) + + // Create invalid kubeconfig Secret + kubeconfigBytes := kubeconfig.FromEnvTestConfig(env.Config, testCluster) + cmdConfig, err := clientcmd.Load(kubeconfigBytes) + g.Expect(err).ToNot(HaveOccurred()) + cmdConfig.Clusters[testCluster.Name].Server += "invalid-context-path" // breaks the server URL. 
+ kubeconfigBytes, err = clientcmd.Write(*cmdConfig) + g.Expect(err).ToNot(HaveOccurred()) + kubeconfigSecret := kubeconfig.GenerateSecret(testCluster, kubeconfigBytes) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + + // Connect with invalid kubeconfig Secret + err = accessor.Connect(ctx) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(Equal("error creating HTTP client and mapper: cluster is not reachable: the server could not find the requested resource")) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) + g.Expect(accessor.lockedState.lastConnectionCreationErrorTimestamp.IsZero()).To(BeFalse()) + + // Cleanup invalid kubeconfig Secret + g.Expect(env.CleanupAndWait(ctx, kubeconfigSecret)).To(Succeed()) + + // Create kubeconfig Secret + kubeconfigSecret = kubeconfig.GenerateSecret(testCluster, kubeconfig.FromEnvTestConfig(env.Config, testCluster)) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, kubeconfigSecret)).To(Succeed()) }() + + // Connect + g.Expect(accessor.Connect(ctx)).To(Succeed()) + g.Expect(accessor.Connected(ctx)).To(BeTrue()) + + g.Expect(accessor.lockedState.connection.restConfig).ToNot(BeNil()) + g.Expect(accessor.lockedState.connection.restClient).ToNot(BeNil()) + g.Expect(accessor.lockedState.connection.cachedClient).ToNot(BeNil()) + g.Expect(accessor.lockedState.connection.cache).ToNot(BeNil()) + g.Expect(accessor.lockedState.connection.watches).ToNot(BeNil()) + + // Check if cache was started / synced + cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, 5*time.Second) + defer cacheSyncCtxCancel() + g.Expect(accessor.lockedState.connection.cache.WaitForCacheSync(cacheSyncCtx)).To(BeTrue()) + + g.Expect(accessor.lockedState.clientCertificatePrivateKey).ToNot(BeNil()) + + g.Expect(accessor.lockedState.healthChecking.lastProbeTimestamp.IsZero()).To(BeFalse()) + g.Expect(accessor.lockedState.healthChecking.lastProbeSuccessTimestamp.IsZero()).To(BeFalse()) + g.Expect(accessor.lockedState.healthChecking.consecutiveFailures).To(Equal(0)) + + // Get client and test Get & List + c, err := accessor.GetClient(ctx) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(c.Get(ctx, client.ObjectKey{Name: metav1.NamespaceDefault}, &corev1.Namespace{})).To(Succeed()) + nodeList := &corev1.NodeList{} + g.Expect(c.List(ctx, nodeList)).To(Succeed()) + g.Expect(nodeList.Items).To(BeEmpty()) + + // Connect again (no-op) + g.Expect(accessor.Connect(ctx)).To(Succeed()) + + // Disconnect + accessor.Disconnect(ctx) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) +} + +func TestDisconnect(t *testing.T) { + g := NewWithT(t) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + clusterKey := client.ObjectKeyFromObject(testCluster) + g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, testCluster)).To(Succeed()) }() + + // Create kubeconfig Secret + kubeconfigSecret := kubeconfig.GenerateSecret(testCluster, kubeconfig.FromEnvTestConfig(env.Config, testCluster)) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, kubeconfigSecret)).To(Succeed()) }() + + config := buildClusterAccessorConfig(env.Manager.GetScheme(), Options{ + SecretClient: env.Manager.GetClient(), + Client: ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Timeout: 10 * 
time.Second, + }, + }, nil) + accessor := newClusterAccessor(clusterKey, config) + + // Connect (so we can disconnect afterward) + g.Expect(accessor.Connect(ctx)).To(Succeed()) + g.Expect(accessor.Connected(ctx)).To(BeTrue()) + + cache := accessor.lockedState.connection.cache + g.Expect(cache.stopped).To(BeFalse()) + + // Disconnect + accessor.Disconnect(ctx) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) + g.Expect(accessor.lockedState.connection).To(BeNil()) + g.Expect(cache.stopped).To(BeTrue()) + + // Disconnect again (no-op) + accessor.Disconnect(ctx) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) + + // Verify health checking state was preserved + g.Expect(accessor.lockedState.clientCertificatePrivateKey).ToNot(BeNil()) + + g.Expect(accessor.lockedState.healthChecking.lastProbeTimestamp.IsZero()).To(BeFalse()) + g.Expect(accessor.lockedState.healthChecking.lastProbeSuccessTimestamp.IsZero()).To(BeFalse()) +} + +func TestHealthCheck(t *testing.T) { + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + clusterKey := client.ObjectKeyFromObject(testCluster) + + tests := []struct { + name string + connected bool + restClientHTTPResponse *http.Response + initialConsecutiveFailures int + wantTooManyConsecutiveFailures bool + wantUnauthorizedErrorOccurred bool + wantConsecutiveFailures int + }{ + { + name: "Health probe failed with unauthorized error", + connected: true, + restClientHTTPResponse: &http.Response{ + StatusCode: http.StatusUnauthorized, + Header: header(), + Body: objBody(&apierrors.NewUnauthorized("authorization failed").ErrStatus), + }, + wantTooManyConsecutiveFailures: false, + wantUnauthorizedErrorOccurred: true, + wantConsecutiveFailures: 1, + }, + { + name: "Health probe failed with other error", + connected: true, + restClientHTTPResponse: &http.Response{ + StatusCode: http.StatusBadRequest, + Header: header(), + Body: objBody(&apierrors.NewBadRequest("bad request").ErrStatus), + }, + wantTooManyConsecutiveFailures: false, + wantUnauthorizedErrorOccurred: false, + wantConsecutiveFailures: 1, + }, + { + name: "Health probe failed with other error (failure threshold met)", + connected: true, + restClientHTTPResponse: &http.Response{ + StatusCode: http.StatusBadRequest, + Header: header(), + Body: objBody(&apierrors.NewBadRequest("bad request").ErrStatus), + }, + initialConsecutiveFailures: 4, // failure threshold is 5 + wantTooManyConsecutiveFailures: true, + wantUnauthorizedErrorOccurred: false, + wantConsecutiveFailures: 5, + }, + { + name: "Health probe succeeded", + connected: true, + restClientHTTPResponse: &http.Response{ + StatusCode: http.StatusOK, + }, + wantTooManyConsecutiveFailures: false, + wantUnauthorizedErrorOccurred: false, + wantConsecutiveFailures: 0, + }, + { + name: "Health probe skipped (not connected)", + connected: false, + restClientHTTPResponse: &http.Response{ + StatusCode: http.StatusOK, + }, + wantTooManyConsecutiveFailures: false, + wantUnauthorizedErrorOccurred: false, + wantConsecutiveFailures: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + accessor := newClusterAccessor(clusterKey, &clusterAccessorConfig{ + HealthProbe: &clusterAccessorHealthProbeConfig{ + Timeout: 5 * time.Second, + FailureThreshold: 5, + }, + }) + accessor.lockedState.connection = &clusterAccessorLockedConnectionState{ + restClient: &fake.RESTClient{ + NegotiatedSerializer: scheme.Codecs, + Resp: tt.restClientHTTPResponse, 
+ Err: nil, + }, + } + accessor.lockedState.healthChecking = clusterAccessorLockedHealthCheckingState{ + consecutiveFailures: tt.initialConsecutiveFailures, + } + + if !tt.connected { + accessor.lockedState.connection = nil + } + + gotTooManyConsecutiveFailures, gotUnauthorizedErrorOccurred := accessor.HealthCheck(ctx) + g.Expect(gotTooManyConsecutiveFailures).To(Equal(tt.wantTooManyConsecutiveFailures)) + g.Expect(gotUnauthorizedErrorOccurred).To(Equal(tt.wantUnauthorizedErrorOccurred)) + }) + } +} + +func TestWatch(t *testing.T) { + g := NewWithT(t) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + clusterKey := client.ObjectKeyFromObject(testCluster) + g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, testCluster)).To(Succeed()) }() + + // Create kubeconfig Secret + kubeconfigSecret := kubeconfig.GenerateSecret(testCluster, kubeconfig.FromEnvTestConfig(env.Config, testCluster)) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, kubeconfigSecret)).To(Succeed()) }() + + config := buildClusterAccessorConfig(env.Manager.GetScheme(), Options{ + SecretClient: env.Manager.GetClient(), + Client: ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Timeout: 10 * time.Second, + }, + }, nil) + accessor := newClusterAccessor(clusterKey, config) + + tw := &testWatcher{} + wi := WatchInput{ + Name: "test-watch", + Watcher: tw, + Kind: &corev1.Node{}, + EventHandler: &handler.EnqueueRequestForObject{}, + } + + // Add watch when not connected (fails) + err := accessor.Watch(ctx, wi) + g.Expect(err).To(HaveOccurred()) + g.Expect(errors.Is(err, ErrClusterNotConnected)).To(BeTrue()) + + // Connect + g.Expect(accessor.Connect(ctx)).To(Succeed()) + g.Expect(accessor.Connected(ctx)).To(BeTrue()) + + g.Expect(accessor.lockedState.connection.watches).To(BeEmpty()) + + // Add watch + g.Expect(accessor.Watch(ctx, wi)).To(Succeed()) + g.Expect(accessor.lockedState.connection.watches.Has("test-watch")).To(BeTrue()) + g.Expect(accessor.lockedState.connection.watches.Len()).To(Equal(1)) + + // Add watch again (no-op as watch already exists) + g.Expect(accessor.Watch(ctx, wi)).To(Succeed()) + g.Expect(accessor.lockedState.connection.watches.Has("test-watch")).To(BeTrue()) + g.Expect(accessor.lockedState.connection.watches.Len()).To(Equal(1)) + + // Disconnect + accessor.Disconnect(ctx) + g.Expect(accessor.Connected(ctx)).To(BeFalse()) +} + +type testWatcher struct { + watchedSources []source.Source +} + +func (w *testWatcher) Watch(src source.Source) error { + w.watchedSources = append(w.watchedSources, src) + return nil +} + +func header() http.Header { + header := http.Header{} + header.Set("Content-Type", runtime.ContentTypeJSON) + return header +} + +func objBody(obj runtime.Object) io.ReadCloser { + corev1GV := schema.GroupVersion{Version: "v1"} + corev1Codec := scheme.Codecs.CodecForVersions(scheme.Codecs.LegacyCodec(corev1GV), scheme.Codecs.UniversalDecoder(corev1GV), corev1GV, corev1GV) + return io.NopCloser(bytes.NewReader([]byte(runtime.EncodeOrDie(corev1Codec, obj)))) +} diff --git a/controllers/clustercache/cluster_cache.go b/controllers/clustercache/cluster_cache.go new file mode 100644 index 000000000000..ddb59532ee67 --- /dev/null +++ b/controllers/clustercache/cluster_cache.go @@ -0,0 +1,667 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package clustercache
+
+import (
+ "context"
+ "crypto/rsa"
+ "fmt"
+ "os"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/pkg/errors"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/types"
+ "k8s.io/client-go/rest"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/cache"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/controller"
+ "sigs.k8s.io/controller-runtime/pkg/event"
+ "sigs.k8s.io/controller-runtime/pkg/handler"
+ "sigs.k8s.io/controller-runtime/pkg/manager"
+ "sigs.k8s.io/controller-runtime/pkg/predicate"
+ "sigs.k8s.io/controller-runtime/pkg/reconcile"
+ "sigs.k8s.io/controller-runtime/pkg/source"
+
+ clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+ "sigs.k8s.io/cluster-api/util/predicates"
+)
+
+// Options defines the options to configure a ClusterCache.
+type Options struct {
+ // SecretClient is the client used to access secrets of type secret.Kubeconfig, i.e. Secrets
+ // with the following name format: "<cluster-name>-kubeconfig".
+ // Ideally this is a client that caches only kubeconfig secrets; it is highly recommended to avoid caching all secrets.
+ // An example of how to create an ideal secret caching client can be found in the core Cluster API controller main.go file.
+ SecretClient client.Reader
+
+ // WatchFilterValue is the label value used to filter events prior to reconciliation.
+ // If a filter excludes a cluster from reconciliation, the accessors for this cluster
+ // will never be created.
+ WatchFilterValue string
+
+ // Cache are the cache options for the caches that are created per cluster.
+ Cache CacheOptions
+
+ // Client are the client options for the clients that are created per cluster.
+ Client ClientOptions
+}
+
+// CacheOptions are the cache options for the caches that are created per cluster.
+type CacheOptions struct {
+ // SyncPeriod is the sync period of the cache.
+ SyncPeriod *time.Duration
+
+ // ByObject restricts the cache's ListWatch to the desired fields per GVK at the specified object.
+ ByObject map[client.Object]cache.ByObject
+
+ // Indexes are the indexes added to the cache.
+ Indexes []CacheOptionsIndex
+}
+
+// CacheOptionsIndex is an index that is added to the cache.
+type CacheOptionsIndex struct {
+ // Object is the object for which the index is created.
+ Object client.Object
+
+ // Field is the field of the index that can later be used when selecting
+ // objects with a field selector.
+ Field string
+
+ // ExtractValue is a func that extracts the index value from an object.
+ ExtractValue client.IndexerFunc
+}
+
+// ClientOptions are the client options for the clients that are created per cluster.
+type ClientOptions struct {
+ // Timeout is the timeout used for the REST config, client and cache.
+ // Defaults to 10s.
+ Timeout time.Duration
+
+ // QPS is the maximum queries per second from the controller client
+ // to the Kubernetes API server of workload clusters.
+ // It is used for the REST config, client and cache.
+ // Defaults to 20.
+ QPS float32
+
+ // Burst is the maximum number of queries that should be allowed in
+ // one burst from the controller client to the Kubernetes API server of workload clusters.
+ // It is used for the REST config, client and cache.
+ // Defaults to 30.
+ Burst int
+
+ // UserAgent is the user agent used for the REST config, client and cache.
+ UserAgent string
+
+ // Cache are the cache options defining how clients should interact with the underlying cache.
+ Cache ClientCacheOptions
+}
+
+// ClientCacheOptions are the cache options for the clients that are created per cluster.
+type ClientCacheOptions struct {
+ // DisableFor is a list of objects that should never be read from the cache.
+ // Get & List calls for objects configured here always result in a live lookup.
+ DisableFor []client.Object
+}
+
+// ClusterCache is a component that caches clients, caches etc. for workload clusters.
+type ClusterCache interface {
+ // GetClient returns a cached client for the given cluster.
+ // If there is no connection to the workload cluster ErrClusterNotConnected will be returned.
+ GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error)
+
+ // GetReader returns a cached read-only client for the given cluster.
+ // If there is no connection to the workload cluster ErrClusterNotConnected will be returned.
+ GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error)
+
+ // GetRESTConfig returns a REST config for the given cluster.
+ // If there is no connection to the workload cluster ErrClusterNotConnected will be returned.
+ GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error)
+
+ // GetClientCertificatePrivateKey returns a private key that is generated once for a cluster
+ // and can then be used to generate client certificates. This is e.g. used in KCP to generate a client
+ // cert to communicate with etcd.
+ // This private key is stored and cached in the ClusterCache because it's expensive to generate a new
+ // private key in every single Reconcile.
+ GetClientCertificatePrivateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error)
+
+ // Watch watches a workload cluster for events.
+ // Each unique watch (by input.Name) is only added once after a Connect (otherwise we return early).
+ // During a disconnect existing watches (i.e. informers) are shut down when stopping the cache.
+ // After a re-connect watches will be re-added (assuming the Watch method is called again).
+ // If there is no connection to the workload cluster ErrClusterNotConnected will be returned.
+ Watch(ctx context.Context, cluster client.ObjectKey, input WatchInput) error
+
+ // GetLastProbeSuccessTimestamp returns the time when the health probe was successfully executed last.
+ GetLastProbeSuccessTimestamp(ctx context.Context, cluster client.ObjectKey) time.Time
+
+ // GetClusterSource returns a Source of Cluster events.
+ // The mapFunc will be used to map from Cluster to reconcile.Request.
+ // reconcile.Requests will always be enqueued on connect and disconnect.
+ // Additionally, the WatchForProbeFailure(time.Duration) option can be used to enqueue reconcile.Requests
+ // if the health probe didn't succeed for the configured duration.
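+ // A typical (illustrative) usage from a reconciler setup could look like this, where clusterToObjects
+ // is a placeholder map function owned by the calling controller:
+ //
+ //  err := ctrl.NewControllerManagedBy(mgr).
+ //      For(&clusterv1.MachineDeployment{}).
+ //      WatchesRawSource(clusterCache.GetClusterSource("machinedeployment", clusterToObjects)).
+ //      Complete(r)
+ //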
+ // Note: GetClusterSource would ideally take a mapFunc that has a *clusterv1.Cluster instead of a client.Object + // as a parameter, but then the existing mapFuncs we already use in our Reconcilers wouldn't work and we would + // have to implement new ones. + GetClusterSource(controllerName string, mapFunc func(ctx context.Context, cluster client.Object) []ctrl.Request, opts ...GetClusterSourceOption) source.Source +} + +// ErrClusterNotConnected is returned by the ClusterCache when e.g. a Client cannot be returned +// because there is no connection to the workload cluster. +var ErrClusterNotConnected = errors.New("connection to the workload cluster is down") + +// Watcher is a scoped-down interface from Controller that only has the Watch func. +type Watcher interface { + // Watch watches the provided Source. + Watch(src source.Source) error +} + +// WatchInput specifies the parameters used to establish a new watch for a workload cluster. +// A source.Kind source (configured with Kind, EventHandler and Predicates) will be added to the Watcher. +// To watch for events, the source.Kind will create an informer on the Cache that we have created and cached +// for the given Cluster. +type WatchInput struct { + // Name represents a unique Watch request for the specified Cluster. + // The name is used to track that a specific watch is only added once to a cache. + // After a connection (and thus also the cache) has been re-created, watches have to be added + // again by calling the Watch method again. + Name string + + // Watcher is the watcher (controller) whose Reconcile() function will be called for events. + Watcher Watcher + + // Kind is the type of resource to watch. + Kind client.Object + + // EventHandler contains the event handlers to invoke for resource events. + EventHandler handler.EventHandler + + // Predicates is used to filter resource events. + Predicates []predicate.Predicate +} + +// GetClusterSourceOption is an option that modifies GetClusterSourceOptions for a GetClusterSource call. +type GetClusterSourceOption interface { + // ApplyToGetClusterSourceOptions applies this option to the given GetClusterSourceOptions. + ApplyToGetClusterSourceOptions(option *GetClusterSourceOptions) +} + +// GetClusterSourceOptions allows to set options for the GetClusterSource method. +type GetClusterSourceOptions struct { + watchForProbeFailures []time.Duration +} + +// ApplyOptions applies the passed list of options to GetClusterSourceOptions, +// and then returns itself for convenient chaining. +func (o *GetClusterSourceOptions) ApplyOptions(opts []GetClusterSourceOption) *GetClusterSourceOptions { + for _, opt := range opts { + opt.ApplyToGetClusterSourceOptions(o) + } + return o +} + +// WatchForProbeFailure will configure the Cluster source to enqueue reconcile.Requests if the health probe +// didn't succeed for the configured duration. +// For example if WatchForProbeFailure is set to 5m, an event will be sent if LastProbeSuccessTimestamp +// is 5m in the past (i.e. health probes didn't succeed in the last 5m). +type WatchForProbeFailure time.Duration + +// ApplyToGetClusterSourceOptions applies WatchForProbeFailure to the given GetClusterSourceOptions. +func (n WatchForProbeFailure) ApplyToGetClusterSourceOptions(opts *GetClusterSourceOptions) { + opts.watchForProbeFailures = append(opts.watchForProbeFailures, time.Duration(n)) +} + +// SetupWithManager sets up a ClusterCache with the given Manager and Options. 
+// This will add a reconciler to the Manager and returns a ClusterCache which can be used +// to retrieve e.g. Clients for a given Cluster. +func SetupWithManager(ctx context.Context, mgr manager.Manager, options Options, controllerOptions controller.Options) (ClusterCache, error) { + log := ctrl.LoggerFrom(ctx).WithValues("controller", "clustercache") + + if err := validateAndDefaultOptions(&options); err != nil { + return nil, err + } + + var controllerPodMetadata *metav1.ObjectMeta + podNamespace := os.Getenv("POD_NAMESPACE") + podName := os.Getenv("POD_NAME") + podUID := os.Getenv("POD_UID") + if podNamespace != "" && podName != "" && podUID != "" { + log.Info("Found controller Pod metadata, the ClusterCache will try to access the cluster it is running on directly if possible") + controllerPodMetadata = &metav1.ObjectMeta{ + Namespace: podNamespace, + Name: podName, + UID: types.UID(podUID), + } + } else { + log.Info("Couldn't find controller Pod metadata, the ClusterCache will always access the cluster it is running on using the regular apiserver endpoint") + } + + cc := &clusterCache{ + client: mgr.GetClient(), + clusterAccessorConfig: buildClusterAccessorConfig(mgr.GetScheme(), options, controllerPodMetadata), + clusterAccessors: make(map[client.ObjectKey]*clusterAccessor), + } + + err := ctrl.NewControllerManagedBy(mgr). + Named("clustercache"). + For(&clusterv1.Cluster{}). + WithOptions(controllerOptions). + WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), log, options.WatchFilterValue)). + Complete(cc) + if err != nil { + return nil, errors.Wrap(err, "failed setting up ClusterCache with a controller manager") + } + + return cc, nil +} + +type clusterCache struct { + client client.Reader + + // clusterAccessorConfig is the config for clusterAccessors. + clusterAccessorConfig *clusterAccessorConfig + + // clusterAccessorsLock is used to synchronize access to clusterAccessors. + clusterAccessorsLock sync.RWMutex + // clusterAccessors is the map of clusterAccessors by cluster. + clusterAccessors map[client.ObjectKey]*clusterAccessor + + // clusterSourcesLock is used to synchronize access to clusterSources. + clusterSourcesLock sync.RWMutex + // clusterSources is used to store information about cluster sources. + // This information is necessary so we can enqueue reconcile.Requests for reconcilers that + // got a cluster source via GetClusterSource. + clusterSources []clusterSource +} + +// clusterSource stores the necessary information so we can enqueue reconcile.Requests for reconcilers that +// got a cluster source via GetClusterSource. +type clusterSource struct { + // controllerName is the name of the controller that will watch this source. + controllerName string + + // ch is the channel on which to send events. + ch chan event.GenericEvent + + // sendEventAfterProbeFailureDurations are the durations after LastProbeSuccessTimestamp + // after which we have to send events. + sendEventAfterProbeFailureDurations []time.Duration + + // lastEventSentTimeByCluster are the timestamps when we last sent an event for a cluster. 
+ lastEventSentTimeByCluster map[client.ObjectKey]time.Time +} + +func (cc *clusterCache) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting client") + } + return accessor.GetClient(ctx) +} + +func (cc *clusterCache) GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error) { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting client reader") + } + return accessor.GetReader(ctx) +} + +func (cc *clusterCache) GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error) { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return nil, errors.Wrapf(ErrClusterNotConnected, "error getting REST config") + } + return accessor.GetRESTConfig(ctx) +} + +func (cc *clusterCache) GetClientCertificatePrivateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error) { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return nil, errors.New("error getting client certificate private key: private key was not generated yet") + } + return accessor.GetClientCertificatePrivateKey(ctx), nil +} + +func (cc *clusterCache) Watch(ctx context.Context, cluster client.ObjectKey, input WatchInput) error { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return errors.Wrapf(ErrClusterNotConnected, "error creating watch %s for %T", input.Name, input.Kind) + } + return accessor.Watch(ctx, input) +} + +func (cc *clusterCache) GetLastProbeSuccessTimestamp(ctx context.Context, cluster client.ObjectKey) time.Time { + accessor := cc.getClusterAccessor(cluster) + if accessor == nil { + return time.Time{} + } + return accessor.GetLastProbeSuccessTimestamp(ctx) +} + +const ( + // defaultRequeueAfter is used as a fallback if no other duration should be used. + defaultRequeueAfter = 10 * time.Second +) + +// Reconcile reconciles Clusters and manages corresponding clusterAccessors. +func (cc *clusterCache) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + log := ctrl.LoggerFrom(ctx) + clusterKey := client.ObjectKey{Namespace: req.Namespace, Name: req.Name} + + accessor := cc.getOrCreateClusterAccessor(clusterKey) + + cluster := &clusterv1.Cluster{} + if err := cc.client.Get(ctx, req.NamespacedName, cluster); err != nil { + if apierrors.IsNotFound(err) { + log.Info("Cluster has been deleted, disconnecting") + accessor.Disconnect(ctx) + cc.deleteClusterAccessor(clusterKey) + cc.cleanupClusterSourcesForCluster(clusterKey) + return ctrl.Result{}, nil + } + + // Requeue, error getting the object. + log.Error(err, fmt.Sprintf("Requeuing after %s (error getting Cluster object)", defaultRequeueAfter)) + return ctrl.Result{RequeueAfter: defaultRequeueAfter}, nil + } + + // Return if infrastructure is not ready yet to avoid trying to open a connection when it cannot succeed. + // Requeue is not needed as there will be a new reconcile.Request when Cluster.status.infrastructureReady is set. + if !cluster.Status.InfrastructureReady { + log.V(6).Info("Can't connect yet, Cluster infrastructure is not ready") + return reconcile.Result{}, nil + } + + // Track if the current Reconcile is doing a connect or disconnect. + didConnect := false + didDisconnect := false + + requeueAfterDurations := []time.Duration{} + + // Try to connect, if not connected. 
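+ // Note: Connection attempts are rate-limited; if the last attempt failed within ConnectionCreationRetryInterval
+ // we only requeue instead of trying to connect again right away.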
+ connected := accessor.Connected(ctx) + if !connected { + lastConnectionCreationErrorTimestamp := accessor.GetLastConnectionCreationErrorTimestamp(ctx) + + // Requeue, if connection creation failed within the ConnectionCreationRetryInterval. + if requeueAfter, requeue := shouldRequeue(time.Now(), lastConnectionCreationErrorTimestamp, accessor.config.ConnectionCreationRetryInterval); requeue { + log.V(6).Info(fmt.Sprintf("Requeuing after %s as connection creation already failed within the last %s", + requeueAfter.Truncate(time.Second/10), accessor.config.ConnectionCreationRetryInterval)) + requeueAfterDurations = append(requeueAfterDurations, requeueAfter) + } else { + if err := accessor.Connect(ctx); err != nil { + // Requeue, if the connect failed. + log.V(6).Info(fmt.Sprintf("Requeuing after %s (connection creation failed)", + accessor.config.ConnectionCreationRetryInterval)) + requeueAfterDurations = append(requeueAfterDurations, accessor.config.ConnectionCreationRetryInterval) + } else { + // Store that connect was done successfully. + didConnect = true + connected = true + } + } + } + + // Run the health probe, if connected. + if connected { + lastProbeTimestamp := accessor.GetLastProbeTimestamp(ctx) + + // Requeue, if health probe was already run within the HealthProbe.Interval. + if requeueAfter, requeue := shouldRequeue(time.Now(), lastProbeTimestamp, accessor.config.HealthProbe.Interval); requeue { + log.V(6).Info(fmt.Sprintf("Requeuing after %s as health probe was already run within the last %s", + requeueAfter.Truncate(time.Second/10), accessor.config.HealthProbe.Interval)) + requeueAfterDurations = append(requeueAfterDurations, requeueAfter) + } else { + // Run the health probe + tooManyConsecutiveFailures, unauthorizedErrorOccurred := accessor.HealthCheck(ctx) + if tooManyConsecutiveFailures || unauthorizedErrorOccurred { + // Disconnect if the health probe failed (either with unauthorized or consecutive failures >= HealthProbe.FailureThreshold). + accessor.Disconnect(ctx) + + // Store that disconnect was done. + didDisconnect = true + connected = false //nolint:ineffassign // connected is *currently* not used below, let's update it anyway + } + switch { + case unauthorizedErrorOccurred: + // Requeue for connection creation immediately. + // If we got an unauthorized error, it could be because the kubeconfig was rotated + // and in that case we want to immediately try to create the connection again. + log.V(6).Info("Requeuing immediately (disconnected after unauthorized error occurred)") + requeueAfterDurations = append(requeueAfterDurations, 1*time.Millisecond) + case tooManyConsecutiveFailures: + // Requeue for connection creation with the regular ConnectionCreationRetryInterval. + log.V(6).Info(fmt.Sprintf("Requeuing after %s (disconnected after consecutive failure threshold met)", + accessor.config.ConnectionCreationRetryInterval)) + requeueAfterDurations = append(requeueAfterDurations, accessor.config.ConnectionCreationRetryInterval) + default: + // Requeue for next health probe. + log.V(6).Info(fmt.Sprintf("Requeuing after %s (health probe succeeded)", + accessor.config.HealthProbe.Interval)) + requeueAfterDurations = append(requeueAfterDurations, accessor.config.HealthProbe.Interval) + } + } + } + + // Send events to cluster sources. 
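+ // Note: Events are sent when a connect or disconnect happened and when the last successful health probe
+ // is older than one of the durations configured via WatchForProbeFailure (see shouldSendEvent).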
+ lastProbeSuccessTime := accessor.GetLastProbeSuccessTimestamp(ctx) + cc.sendEventsToClusterSources(ctx, cluster, time.Now(), lastProbeSuccessTime, didConnect, didDisconnect) + + // Requeue based on requeueAfterDurations (fallback to defaultRequeueAfter). + return reconcile.Result{RequeueAfter: minDurationOrDefault(requeueAfterDurations, defaultRequeueAfter)}, nil +} + +// getOrCreateClusterAccessor returns a clusterAccessor and creates it if it doesn't exist already. +// Note: This intentionally does not already create a client and cache. This is later done +// via clusterAccessor.Connect() by the ClusterCache reconciler. +// Note: This method should only be called in the ClusterCache Reconcile method. Otherwise it could happen +// that a new clusterAccessor is created even after ClusterCache Reconcile deleted the clusterAccessor after +// Cluster object deletion. +func (cc *clusterCache) getOrCreateClusterAccessor(cluster client.ObjectKey) *clusterAccessor { + cc.clusterAccessorsLock.Lock() + defer cc.clusterAccessorsLock.Unlock() + + accessor, ok := cc.clusterAccessors[cluster] + if !ok { + accessor = newClusterAccessor(cluster, cc.clusterAccessorConfig) + cc.clusterAccessors[cluster] = accessor + } + + return accessor +} + +// getClusterAccessor returns a clusterAccessor if it exists, otherwise nil. +func (cc *clusterCache) getClusterAccessor(cluster client.ObjectKey) *clusterAccessor { + cc.clusterAccessorsLock.RLock() + defer cc.clusterAccessorsLock.RUnlock() + + return cc.clusterAccessors[cluster] +} + +// deleteClusterAccessor deletes the clusterAccessor for the given cluster in the clusterAccessors map. +func (cc *clusterCache) deleteClusterAccessor(cluster client.ObjectKey) { + cc.clusterAccessorsLock.Lock() + defer cc.clusterAccessorsLock.Unlock() + + delete(cc.clusterAccessors, cluster) +} + +// shouldRequeue calculates if we should requeue based on the lastExecutionTime and the interval. +// Note: We can implement a more sophisticated backoff mechanism later if really necessary. 
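+// Example: with an interval of 20s and a lastExecutionTime 5s ago this returns (15s, true);
+// if lastExecutionTime is zero or at least 20s in the past it returns (0, false).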
+func shouldRequeue(now, lastExecutionTime time.Time, interval time.Duration) (time.Duration, bool) { + if lastExecutionTime.IsZero() { + return time.Duration(0), false + } + + timeSinceLastExecution := now.Sub(lastExecutionTime) + if timeSinceLastExecution < interval { + return interval - timeSinceLastExecution, true + } + + return time.Duration(0), false +} + +func minDurationOrDefault(durations []time.Duration, defaultDuration time.Duration) time.Duration { + if len(durations) == 0 { + return defaultDuration + } + + d := durations[0] + for i := 1; i < len(durations); i++ { + if durations[i] < d { + d = durations[i] + } + } + return d +} + +func (cc *clusterCache) cleanupClusterSourcesForCluster(cluster client.ObjectKey) { + cc.clusterSourcesLock.Lock() + defer cc.clusterSourcesLock.Unlock() + + for _, cs := range cc.clusterSources { + delete(cs.lastEventSentTimeByCluster, cluster) + } +} + +func (cc *clusterCache) GetClusterSource(controllerName string, mapFunc func(ctx context.Context, cluster client.Object) []ctrl.Request, opts ...GetClusterSourceOption) source.Source { + cc.clusterSourcesLock.Lock() + defer cc.clusterSourcesLock.Unlock() + + getClusterSourceOptions := &GetClusterSourceOptions{} + getClusterSourceOptions.ApplyOptions(opts) + + cs := clusterSource{ + controllerName: controllerName, + ch: make(chan event.GenericEvent), + sendEventAfterProbeFailureDurations: getClusterSourceOptions.watchForProbeFailures, + lastEventSentTimeByCluster: map[client.ObjectKey]time.Time{}, + } + cc.clusterSources = append(cc.clusterSources, cs) + + return source.Channel(cs.ch, handler.TypedEnqueueRequestsFromMapFunc(mapFunc)) +} + +func (cc *clusterCache) sendEventsToClusterSources(ctx context.Context, cluster *clusterv1.Cluster, now, lastProbeSuccessTime time.Time, didConnect, didDisconnect bool) { + log := ctrl.LoggerFrom(ctx) + + cc.clusterSourcesLock.Lock() + defer cc.clusterSourcesLock.Unlock() + + clusterKey := client.ObjectKeyFromObject(cluster) + + for _, cs := range cc.clusterSources { + lastEventSentTime := cs.lastEventSentTimeByCluster[clusterKey] + + if reasons := shouldSendEvent(now, lastProbeSuccessTime, lastEventSentTime, didConnect, didDisconnect, cs.sendEventAfterProbeFailureDurations); len(reasons) > 0 { + log.V(6).Info("Sending Cluster event", "targetController", cs.controllerName, "reasons", strings.Join(reasons, ", ")) + cs.ch <- event.GenericEvent{ + Object: cluster, + } + cs.lastEventSentTimeByCluster[clusterKey] = now + } + } +} + +func shouldSendEvent(now, lastProbeSuccessTime, lastEventSentTime time.Time, didConnect, didDisconnect bool, sendEventAfterProbeFailureDurations []time.Duration) []string { + var reasons []string + if didConnect { + // Send an event if connect was done. + reasons = append(reasons, "connect") + } + if didDisconnect { + // Send an event if disconnect was done. + reasons = append(reasons, "disconnect") + } + + if len(sendEventAfterProbeFailureDurations) > 0 { + for _, failureDuration := range sendEventAfterProbeFailureDurations { + shouldSendEventTime := lastProbeSuccessTime.Add(failureDuration) + + // Send an event if: + // * lastEventSentTime is before shouldSendEventTime and + // * shouldSendEventTime is in the past + if lastEventSentTime.Before(shouldSendEventTime) && shouldSendEventTime.Before(now) { + reasons = append(reasons, fmt.Sprintf("health probe didn't succeed since more than %s", failureDuration)) + } + } + } + + return reasons +} + +// SetConnectionCreationRetryInterval can be used to overwrite the ConnectionCreationRetryInterval. 
+// This method should only be used for tests and is not part of the public ClusterCache interface. +func (cc *clusterCache) SetConnectionCreationRetryInterval(interval time.Duration) { + cc.clusterAccessorConfig.ConnectionCreationRetryInterval = interval +} + +func validateAndDefaultOptions(opts *Options) error { + if opts.SecretClient == nil { + return errors.New("options.SecretClient must be set") + } + + if opts.Client.Timeout.Nanoseconds() == 0 { + opts.Client.Timeout = 10 * time.Second + } + if opts.Client.QPS == 0 { + opts.Client.QPS = 20 + } + if opts.Client.Burst == 0 { + opts.Client.Burst = 30 + } + if opts.Client.UserAgent == "" { + return errors.New("options.Client.UserAgent must be set") + } + + return nil +} + +func buildClusterAccessorConfig(scheme *runtime.Scheme, options Options, controllerPodMetadata *metav1.ObjectMeta) *clusterAccessorConfig { + return &clusterAccessorConfig{ + Scheme: scheme, + SecretClient: options.SecretClient, + ControllerPodMetadata: controllerPodMetadata, + ConnectionCreationRetryInterval: 30 * time.Second, + Cache: &clusterAccessorCacheConfig{ + InitialSyncTimeout: 5 * time.Minute, + SyncPeriod: options.Cache.SyncPeriod, + ByObject: options.Cache.ByObject, + Indexes: options.Cache.Indexes, + }, + Client: &clusterAccessorClientConfig{ + Timeout: options.Client.Timeout, + QPS: options.Client.QPS, + Burst: options.Client.Burst, + UserAgent: options.Client.UserAgent, + Cache: clusterAccessorClientCacheConfig{ + DisableFor: options.Client.Cache.DisableFor, + }, + }, + HealthProbe: &clusterAccessorHealthProbeConfig{ + Timeout: 5 * time.Second, + Interval: 10 * time.Second, + FailureThreshold: 5, + }, + } +} diff --git a/controllers/clustercache/cluster_cache_fake.go b/controllers/clustercache/cluster_cache_fake.go new file mode 100644 index 000000000000..c2af2730e28a --- /dev/null +++ b/controllers/clustercache/cluster_cache_fake.go @@ -0,0 +1,39 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clustercache + +import ( + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// NewFakeClusterCache creates a new fake ClusterCache that can be used by unit tests. +func NewFakeClusterCache(workloadClient client.Client, clusterKey client.ObjectKey, watchObjects ...string) ClusterCache { + testCacheTracker := &clusterCache{ + clusterAccessors: make(map[client.ObjectKey]*clusterAccessor), + } + + testCacheTracker.clusterAccessors[clusterKey] = &clusterAccessor{ + lockedState: clusterAccessorLockedState{ + connection: &clusterAccessorLockedConnectionState{ + cachedClient: workloadClient, + watches: sets.Set[string]{}.Insert(watchObjects...), + }, + }, + } + return testCacheTracker +} diff --git a/controllers/clustercache/cluster_cache_test.go b/controllers/clustercache/cluster_cache_test.go new file mode 100644 index 000000000000..5cff083f2eb9 --- /dev/null +++ b/controllers/clustercache/cluster_cache_test.go @@ -0,0 +1,781 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clustercache + +import ( + "context" + "fmt" + "math" + "math/rand/v2" + "net/http" + "sync" + "testing" + "time" + + . "github.com/onsi/gomega" + pkgerrors "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest/fake" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/util/kubeconfig" +) + +func TestReconcile(t *testing.T) { + g := NewWithT(t) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + clusterKey := client.ObjectKeyFromObject(testCluster) + g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) + defer func() { g.Expect(client.IgnoreNotFound(env.CleanupAndWait(ctx, testCluster))).To(Succeed()) }() + + opts := Options{ + SecretClient: env.Manager.GetClient(), + Client: ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Timeout: 10 * time.Second, + }, + Cache: CacheOptions{ + Indexes: []CacheOptionsIndex{NodeProviderIDIndex}, + }, + } + accessorConfig := buildClusterAccessorConfig(env.Manager.GetScheme(), opts, nil) + cc := &clusterCache{ + // Use APIReader to avoid cache issues when reading the Cluster object. + client: env.Manager.GetAPIReader(), + clusterAccessorConfig: accessorConfig, + clusterAccessors: make(map[client.ObjectKey]*clusterAccessor), + } + + // Add a Cluster source and start it (queue will be later used to verify the source works correctly) + clusterQueue := workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.DefaultTypedControllerRateLimiter[reconcile.Request](), + workqueue.TypedRateLimitingQueueConfig[reconcile.Request]{ + Name: "test-controller", + }) + g.Expect(cc.GetClusterSource("test-controller", func(_ context.Context, o client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}} + }).Start(ctx, clusterQueue)).To(Succeed()) + + // Reconcile, Cluster.Status.InfrastructureReady == false + // => we expect no requeue. 
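+ // (No requeue is needed because a new reconcile.Request is triggered once infrastructureReady is set.)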
+ res, err := cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + + // Set Cluster.Status.InfrastructureReady == true + patch := client.MergeFrom(testCluster.DeepCopy()) + testCluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, testCluster, patch)).To(Succeed()) + + // Reconcile, kubeconfig Secret doesn't exist + // => accessor.Connect will fail so we expect a retry with ConnectionCreationRetryInterval. + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res).To(Equal(reconcile.Result{RequeueAfter: accessorConfig.ConnectionCreationRetryInterval})) + + // Create kubeconfig Secret + kubeconfigSecret := kubeconfig.GenerateSecret(testCluster, kubeconfig.FromEnvTestConfig(env.Config, testCluster)) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + defer func() { g.Expect(env.CleanupAndWait(ctx, kubeconfigSecret)).To(Succeed()) }() + + // Reconcile again + // => accessor.Connect just failed, so because of rate-limiting we expect a retry with + // slightly less than ConnectionCreationRetryInterval. + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter >= accessorConfig.ConnectionCreationRetryInterval-2*time.Second).To(BeTrue()) + g.Expect(res.RequeueAfter <= accessorConfig.ConnectionCreationRetryInterval).To(BeTrue()) + + // Set lastConnectionCreationErrorTimestamp to now - ConnectionCreationRetryInterval to skip over rate-limiting + cc.getClusterAccessor(clusterKey).lockedState.lastConnectionCreationErrorTimestamp = time.Now().Add(-1 * accessorConfig.ConnectionCreationRetryInterval) + + // Reconcile again, accessor.Connect works now + // => because accessor.Connect just set the lastProbeTimestamp we expect a retry with + // slightly less than HealthProbe.Interval + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter >= accessorConfig.HealthProbe.Interval-2*time.Second).To(BeTrue()) + g.Expect(res.RequeueAfter <= accessorConfig.HealthProbe.Interval).To(BeTrue()) + g.Expect(cc.getClusterAccessor(clusterKey).Connected(ctx)).To(BeTrue()) + + // Reconcile again + // => we still expect a retry with slightly less than HealthProbe.Interval + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter >= accessorConfig.HealthProbe.Interval-2*time.Second).To(BeTrue()) + g.Expect(res.RequeueAfter <= accessorConfig.HealthProbe.Interval).To(BeTrue()) + + // Set last probe timestamps to now - accessorConfig.HealthProbe.Interval to skip over rate-limiting + cc.getClusterAccessor(clusterKey).lockedState.healthChecking.lastProbeTimestamp = time.Now().Add(-1 * accessorConfig.HealthProbe.Interval) + cc.getClusterAccessor(clusterKey).lockedState.healthChecking.lastProbeSuccessTimestamp = time.Now().Add(-1 * accessorConfig.HealthProbe.Interval) + + // Reconcile again, now the health probe will be run successfully + // => so we expect a retry with slightly less than HealthProbe.Interval + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter >= accessorConfig.HealthProbe.Interval-2*time.Second).To(BeTrue()) + g.Expect(res.RequeueAfter <= 
accessorConfig.HealthProbe.Interval).To(BeTrue()) + g.Expect(cc.getClusterAccessor(clusterKey).Connected(ctx)).To(BeTrue()) + g.Expect(cc.getClusterAccessor(clusterKey).lockedState.healthChecking.consecutiveFailures).To(Equal(0)) + + // Exchange the REST client, so the next health probe will return an unauthorized error + cc.getClusterAccessor(clusterKey).lockedState.connection.restClient = &fake.RESTClient{ + NegotiatedSerializer: scheme.Codecs, + Resp: &http.Response{ + StatusCode: http.StatusUnauthorized, + Header: header(), + Body: objBody(&apierrors.NewUnauthorized("authorization failed").ErrStatus), + }, + } + // Set last probe timestamps to now - accessorConfig.HealthProbe.Interval to skip over rate-limiting + cc.getClusterAccessor(clusterKey).lockedState.healthChecking.lastProbeTimestamp = time.Now().Add(-1 * accessorConfig.HealthProbe.Interval) + cc.getClusterAccessor(clusterKey).lockedState.healthChecking.lastProbeSuccessTimestamp = time.Now().Add(-1 * accessorConfig.HealthProbe.Interval) + + // Reconcile again, now the health probe will fail + // => so we expect a disconnect and an immediate retry. + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res).To(Equal(reconcile.Result{RequeueAfter: 1 * time.Millisecond})) + + // At this point there should still be a cluster accessor, but it is disconnected + _, ok := cc.clusterAccessors[clusterKey] + g.Expect(ok).To(BeTrue()) + g.Expect(cc.getClusterAccessor(clusterKey).Connected(ctx)).To(BeFalse()) + // There should be one cluster source with an entry for lastEventSentTime for the current cluster. + g.Expect(cc.clusterSources).To(HaveLen(1)) + _, ok = cc.clusterSources[0].lastEventSentTimeByCluster[clusterKey] + g.Expect(ok).To(BeTrue()) + + // Delete the Cluster + g.Expect(env.CleanupAndWait(ctx, testCluster)).To(Succeed()) + + // Reconcile again, Cluster has been deleted + // => so we expect another Disconnect (no-op) and cleanup of the cluster accessor and cluster sources entries + res, err = cc.Reconcile(ctx, reconcile.Request{NamespacedName: clusterKey}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + + // Cluster accessor should have been removed + _, ok = cc.clusterAccessors[clusterKey] + g.Expect(ok).To(BeFalse()) + // lastEventSentTime in cluster source for the current cluster should have been removed. + g.Expect(cc.clusterSources).To(HaveLen(1)) + _, ok = cc.clusterSources[0].lastEventSentTimeByCluster[clusterKey] + g.Expect(ok).To(BeFalse()) + + // Verify Cluster queue. 
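+ // Note: Multiple events were sent for this Cluster (connect, disconnect), but they map to the same
+ // reconcile.Request and the workqueue deduplicates identical items, so exactly one entry is expected.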
+ g.Expect(clusterQueue.Len()).To(Equal(1)) + g.Expect(clusterQueue.Get()).To(Equal(reconcile.Request{NamespacedName: clusterKey})) +} + +func TestShouldRequeue(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + now time.Time + lastExecution time.Time + interval time.Duration + wantRequeue bool + wantRequeueAfter time.Duration + }{ + { + name: "Don't requeue first execution (interval: 20s)", + now: now, + lastExecution: time.Time{}, + interval: 20 * time.Second, + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + { + name: "Requeue after 15s last execution was 5s ago (interval: 20s)", + now: now, + lastExecution: now.Add(-time.Duration(5) * time.Second), + interval: 20 * time.Second, + wantRequeue: true, + wantRequeueAfter: time.Duration(15) * time.Second, + }, + { + name: "Don't requeue last execution was 20s ago (interval: 20s)", + now: now, + lastExecution: now.Add(-time.Duration(20) * time.Second), + interval: 20 * time.Second, + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + { + name: "Don't requeue last execution was 60s ago (interval: 20s)", + now: now, + lastExecution: now.Add(-time.Duration(60) * time.Second), + interval: 20 * time.Second, + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + gotRequeueAfter, gotRequeue := shouldRequeue(tt.now, tt.lastExecution, tt.interval) + g.Expect(gotRequeue).To(Equal(tt.wantRequeue)) + g.Expect(gotRequeueAfter).To(Equal(tt.wantRequeueAfter)) + }) + } +} + +func TestMinDurationOrDefault(t *testing.T) { + tests := []struct { + name string + durations []time.Duration + defaultDuration time.Duration + wantDuration time.Duration + }{ + { + name: "nil durations use default duration", + durations: nil, + defaultDuration: defaultRequeueAfter, + wantDuration: defaultRequeueAfter, + }, + { + name: "empty durations use default duration", + durations: []time.Duration{}, + defaultDuration: defaultRequeueAfter, + wantDuration: defaultRequeueAfter, + }, + { + name: "use min duration", + durations: []time.Duration{5 * time.Second, 10 * time.Second}, + defaultDuration: defaultRequeueAfter, + wantDuration: 5 * time.Second, + }, + { + name: "use min duration even if default duration is smaller", + durations: []time.Duration{5 * time.Hour, 10 * time.Hour}, + defaultDuration: defaultRequeueAfter, + wantDuration: 5 * time.Hour, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + gotDuration := minDurationOrDefault(tt.durations, tt.defaultDuration) + g.Expect(gotDuration).To(Equal(tt.wantDuration)) + }) + } +} + +func TestSendEventsToClusterSources(t *testing.T) { + now := time.Now() + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + } + + type testClusterSources struct { + controllerName string + sendEventAfterProbeFailureDurations []time.Duration + lastEventSentTime time.Time + } + + tests := []struct { + name string + now time.Time + lastProbeSuccessTime time.Time + didConnect bool + didDisconnect bool + clusterSources []testClusterSources + wantEventsToControllers []string + }{ + { + name: "send event because of connect", + now: now, + lastProbeSuccessTime: now, + didConnect: true, + clusterSources: []testClusterSources{ + { + controllerName: "cluster", + }, + { + controllerName: "machineset", + }, + { + controllerName: "machine", + }, + }, + 
wantEventsToControllers: []string{"cluster", "machineset", "machine"}, + }, + { + name: "send event because of probe failure to some controllers", + now: now, + lastProbeSuccessTime: now.Add(-60 * time.Second), // probe last succeeded 60s ago. + clusterSources: []testClusterSources{ + { + controllerName: "cluster", + lastEventSentTime: now.Add(-40 * time.Second), // last event was sent 40s ago (20s after last successful probe) + // cluster controller already got an event after 20s which is >= 5s, won't get another event + sendEventAfterProbeFailureDurations: []time.Duration{5 * time.Second}, + }, + { + controllerName: "machineset", + lastEventSentTime: now.Add(-40 * time.Second), // last event was sent 40s ago (20s after last successful probe) + // machineset controller didn't get an event after >= 30s, will get an event + sendEventAfterProbeFailureDurations: []time.Duration{30 * time.Second}, + }, + { + controllerName: "machine", + lastEventSentTime: now.Add(-40 * time.Second), // last event was sent 40s ago (20s after last successful probe) + // machine controller won't get an event as event should be sent at -60s + 120s = now + 60s + sendEventAfterProbeFailureDurations: []time.Duration{120 * time.Second}, + }, + }, + wantEventsToControllers: []string{"machineset"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + gotEventsToController := []string{} + var lock sync.Mutex + var wg sync.WaitGroup + + cc := &clusterCache{} + for i, cs := range tt.clusterSources { + // Create cluster source + opts := []GetClusterSourceOption{} + for _, sendEventAfterProbeFailureDuration := range cs.sendEventAfterProbeFailureDurations { + opts = append(opts, WatchForProbeFailure(sendEventAfterProbeFailureDuration)) + } + g.Expect(cc.GetClusterSource(cs.controllerName, func(_ context.Context, o client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}} + }, opts...)).ToNot(BeNil()) + + cc.clusterSources[i].lastEventSentTimeByCluster[client.ObjectKeyFromObject(testCluster)] = cs.lastEventSentTime + + // Create go routine to read events from source and store them in gotEventsToController + wg.Add(1) + go func(ch <-chan event.GenericEvent) { + for { + _, ok := <-ch + if !ok { + wg.Done() + return + } + lock.Lock() + gotEventsToController = append(gotEventsToController, cs.controllerName) + lock.Unlock() + } + }(cc.clusterSources[i].ch) + } + + cc.sendEventsToClusterSources(ctx, testCluster, tt.now, tt.lastProbeSuccessTime, tt.didConnect, tt.didDisconnect) + + for _, cs := range cc.clusterSources { + close(cs.ch) + } + wg.Wait() + + g.Expect(gotEventsToController).To(ConsistOf(tt.wantEventsToControllers)) + }) + } +} + +func TestShouldSentEvent(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + now time.Time + lastProbeSuccessTime time.Time + lastEventSentTime time.Time + didConnect bool + didDisconnect bool + sendEventAfterProbeFailureDurations []time.Duration + wantReasons []string + }{ + { + name: "send event because of connect", + now: now, + lastProbeSuccessTime: now, + lastEventSentTime: now.Add(-5 * time.Second), + didConnect: true, + didDisconnect: false, + wantReasons: []string{"connect"}, + }, + { + name: "send event because of disconnect", + now: now, + lastProbeSuccessTime: now, + lastEventSentTime: now.Add(-5 * time.Second), + didConnect: false, + didDisconnect: true, + wantReasons: []string{"disconnect"}, + }, + { + name: "send event, last probe success is longer ago than failure 
duration", + now: now, + lastProbeSuccessTime: now.Add(-60 * time.Second), // probe last succeeded 60s ago. + lastEventSentTime: now.Add(-40 * time.Second), // last event was sent 40s ago (20s after last successful probe). + sendEventAfterProbeFailureDurations: []time.Duration{21 * time.Second}, // event should be sent 21s after last successful probe + wantReasons: []string{"health probe didn't succeed since more than 21s"}, + }, + { + name: "don't send event, last probe success is longer ago than failure duration, but an event was already sent", + now: now, + lastProbeSuccessTime: now.Add(-60 * time.Second), // probe last succeeded 60s ago. + lastEventSentTime: now.Add(-39 * time.Second), // last event was sent 39s ago (21s after last successful probe). + sendEventAfterProbeFailureDurations: []time.Duration{21 * time.Second}, // event should be sent 21s after last successful probe + wantReasons: nil, + }, + { + name: "don't send event, last probe success is not longer ago than failure duration (event should be send in 1s)", + now: now, + lastProbeSuccessTime: now.Add(-60 * time.Second), // probe last succeeded 60s ago. + lastEventSentTime: now.Add(-39 * time.Second), // last event was sent 39s ago (21s after last successful probe). + sendEventAfterProbeFailureDurations: []time.Duration{61 * time.Second}, // event should be sent 61s after last successful probe + wantReasons: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + gotReasons := shouldSendEvent(tt.now, tt.lastProbeSuccessTime, tt.lastEventSentTime, tt.didConnect, tt.didDisconnect, tt.sendEventAfterProbeFailureDurations) + g.Expect(gotReasons).To(Equal(tt.wantReasons)) + }) + } +} + +type testCluster struct { + cluster client.ObjectKey + brokenRESTConfig bool +} + +func TestClusterCacheConcurrency(t *testing.T) { + g := NewWithT(t) + + // Scale parameters for the test. + const ( + // ClusterCache will only be run with 10 workers (default is 100). + // So in general 10x clusterCount should be at least possible + clusterCacheConcurrency = 10 + + // clusterCount is the number of clusters we create + clusterCount = 100 + // brokenClusterPercentage is the percentage of clusters that are broken, + // i.e. they'll have a broken REST config and connection creation will time out. + brokenClusterPercentage = 20 // 20% + // clusterCreationConcurrency is the number of workers that are creating Cluster objects in parallel. + clusterCreationConcurrency = 10 + + // clusterCacheConsumerConcurrency is the number of workers for consumers that use the ClusterCache, + // i.e. this is the equivalent of the sum of workers of all reconcilers calling .GetClient etc. + clusterCacheConsumerConcurrency = 100 + + // clusterCacheConsumerDuration is the duration for which we let the consumers run. + clusterCacheConsumerDuration = 1 * time.Minute + ) + + // Set up ClusterCache. + cc, err := SetupWithManager(ctx, env.Manager, Options{ + SecretClient: env.Manager.GetClient(), + Cache: CacheOptions{ + Indexes: []CacheOptionsIndex{NodeProviderIDIndex}, + }, + Client: ClientOptions{ + QPS: 20, + Burst: 30, + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Timeout: 10 * time.Second, + Cache: ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: clusterCacheConcurrency}) + g.Expect(err).ToNot(HaveOccurred()) + internalClusterCache, ok := cc.(*clusterCache) + g.Expect(ok).To(BeTrue()) + + // Generate test clusters. + testClusters := generateTestClusters(clusterCount, brokenClusterPercentage) + + // Create test clusters concurrently + start := time.Now() + inputChan := make(chan testCluster) + var wg sync.WaitGroup + for range clusterCreationConcurrency { + wg.Add(1) + go func() { + for { + testCluster, ok := <-inputChan + if !ok { + wg.Done() + return + } + createCluster(g, testCluster) + } + }() + } + go func() { + for _, testCluster := range testClusters { + inputChan <- testCluster + } + // All clusters are requested, close the channel to shut down workers. + close(inputChan) + }() + wg.Wait() + afterClusterCreation := time.Since(start) + t.Logf("Created %d clusters in %s", clusterCount, afterClusterCreation) + + // Ensure ClusterCache had a chance to run Connect for each Cluster at least once. + // Otherwise we e.g. can't assume below that every working cluster is already connected. + g.Eventually(func(g Gomega) { + for _, tc := range testClusters { + accessor := internalClusterCache.getClusterAccessor(tc.cluster) + g.Expect(accessor).ToNot(BeNil()) + if tc.brokenRESTConfig { + g.Expect(accessor.GetLastConnectionCreationErrorTimestamp(ctx).IsZero()).To(BeFalse()) + } else { + g.Expect(accessor.Connected(ctx)).To(BeTrue()) + } + } + }, 3*time.Minute, 5*time.Second).Should(Succeed()) + + t.Logf("ClusterCache ran Connect at least once per Cluster now") + + // Run some workers to simulate other reconcilers using the ClusterCache + errChan := make(chan error) + errStopChan := make(chan struct{}) + consumerCtx, consumerCancel := context.WithCancel(ctx) + errs := []error{} + for range clusterCacheConsumerConcurrency { + wg.Add(1) + go func(ctx context.Context) { + for { + select { + case <-consumerCtx.Done(): + wg.Done() + return + default: + } + + // Pick a random cluster + tc := testClusters[rand.IntN(len(testClusters))] //nolint:gosec // weak random numbers are fine + + // Try to get a client. + c, err := cc.GetClient(ctx, tc.cluster) + + if tc.brokenRESTConfig { + // If the cluster has a broken REST config, validate: + // * we got an error when getting the client + // * the accessor is not connected + // * connection creation was tried in a reasonable timeframe (ConnectionCreationRetryInterval*1,2) + // As a lot of clusters are enqueued at the same time we have to give a bit of buffer (20% for now). 
+				if err == nil {
+					errChan <- pkgerrors.Errorf("cluster %s: expected an error when getting client", tc.cluster)
+					continue
+				}
+
+				accessor := internalClusterCache.getClusterAccessor(tc.cluster)
+				if accessor.Connected(ctx) {
+					errChan <- pkgerrors.Errorf("cluster %s: expected accessor to not be connected", tc.cluster)
+					continue
+				}
+
+				lastConnectionCreationErrorTimestamp := accessor.GetLastConnectionCreationErrorTimestamp(ctx)
+				if time.Since(lastConnectionCreationErrorTimestamp) > (internalClusterCache.clusterAccessorConfig.ConnectionCreationRetryInterval * 120 / 100) {
+					errChan <- pkgerrors.Wrapf(err, "cluster %s: connection creation wasn't tried within the connection creation retry interval", tc.cluster)
+					continue
+				}
+			} else {
+				// If the cluster has a valid REST config, validate:
+				// * we didn't get an error when getting the client
+				// * the accessor is connected
+				// * the client works, by listing Nodes
+				// * health probe was run in a reasonable timeframe (HealthProbe.Interval*1.2)
+				// As a lot of clusters are enqueued at the same time we have to give a bit of buffer (20% for now).
+				if err != nil {
+					errChan <- pkgerrors.Wrapf(err, "cluster %s: got unexpected error when getting client: %v", tc.cluster, err)
+					continue
+				}
+
+				accessor := internalClusterCache.getClusterAccessor(tc.cluster)
+				if !accessor.Connected(ctx) {
+					errChan <- pkgerrors.Errorf("cluster %s: expected accessor to be connected", tc.cluster)
+					continue
+				}
+
+				if err := c.List(ctx, &corev1.NodeList{}); err != nil {
+					errChan <- pkgerrors.Wrapf(err, "cluster %s: unexpected error when using client to list Nodes: %v", tc.cluster, err)
+					continue
+				}
+
+				lastProbeSuccessTimestamp := cc.GetLastProbeSuccessTimestamp(ctx, tc.cluster)
+				if time.Since(lastProbeSuccessTimestamp) > (internalClusterCache.clusterAccessorConfig.HealthProbe.Interval * 120 / 100) {
+					// Note: Use Errorf here, err is nil at this point and wrapping it would produce a nil error.
+					errChan <- pkgerrors.Errorf("cluster %s: health probe wasn't run successfully within the health probe interval", tc.cluster)
+					continue
+				}
+			}
+
+			// Wait before the next iteration.
+			time.Sleep(10 * time.Millisecond)
+			}
+		}(consumerCtx)
+	}
+	go func() {
+		for {
+			err, ok := <-errChan
+			if !ok {
+				errStopChan <- struct{}{}
+				return
+			}
+			errs = append(errs, err)
+		}
+	}()
+
+	t.Logf("Consumers will now run for %s", clusterCacheConsumerDuration)
+	time.Sleep(clusterCacheConsumerDuration)
+
+	// Cancel consumer go routines.
+	consumerCancel()
+	// Wait until consumer go routines are done.
+	wg.Wait()
+	// Close error channel to stop error collection go routine.
+	close(errChan)
+	// Wait until error collection go routine is done.
+	<-errStopChan
+
+	// Validate that no errors occurred.
+	// Note: This test is also run with the race detector to detect race conditions.
+	if len(errs) > 0 {
+		g.Expect(kerrors.NewAggregate(errs)).ToNot(HaveOccurred())
+	}
+
+	// The expectation is that the ClusterCache Reconciler never returns errors to avoid exponential backoff.
+ errorsTotal, err := getCounterMetric("controller_runtime_reconcile_errors_total", "clustercache") + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(errorsTotal).To(Equal(float64(0))) + + panicsTotal, err := getCounterMetric("controller_runtime_reconcile_panics_total", "clustercache") + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(panicsTotal).To(Equal(float64(0))) +} + +func generateTestClusters(clusterCount, brokenClusterPercentage int) []testCluster { + testClusters := make([]testCluster, 0, clusterCount) + + clusterNameDigits := 1 + int(math.Log10(float64(clusterCount))) + for i := 1; i <= clusterCount; i++ { + // This ensures we always have the right number of leading zeros in our cluster names, e.g. + // clusterCount=1000 will lead to cluster names like scale-0001, scale-0002, ... . + // This makes it possible to have correct ordering of clusters in diagrams in tools like Grafana. + name := fmt.Sprintf("%s-%0*d", "cluster", clusterNameDigits, i) + testClusters = append(testClusters, testCluster{ + cluster: client.ObjectKey{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + brokenRESTConfig: i%(100/brokenClusterPercentage) == 0, + }) + } + return testClusters +} + +func createCluster(g Gomega, testCluster testCluster) { + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCluster.cluster.Name, + Namespace: testCluster.cluster.Namespace, + }, + } + g.Expect(env.CreateAndWait(ctx, cluster)).To(Succeed()) + + if !testCluster.brokenRESTConfig { + g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) + } else { + // Create invalid kubeconfig Secret + kubeconfigBytes := kubeconfig.FromEnvTestConfig(env.Config, cluster) + cmdConfig, err := clientcmd.Load(kubeconfigBytes) + g.Expect(err).ToNot(HaveOccurred()) + cmdConfig.Clusters[cluster.Name].Server = "https://1.2.3.4" // breaks the server URL. + kubeconfigBytes, err = clientcmd.Write(*cmdConfig) + g.Expect(err).ToNot(HaveOccurred()) + kubeconfigSecret := kubeconfig.GenerateSecret(cluster, kubeconfigBytes) + g.Expect(env.CreateAndWait(ctx, kubeconfigSecret)).To(Succeed()) + } + + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) +} + +func getCounterMetric(metricFamilyName, controllerName string) (float64, error) { + metricFamilies, err := metrics.Registry.Gather() + if err != nil { + return 0, err + } + + for _, metricFamily := range metricFamilies { + if metricFamily.GetName() == metricFamilyName { + for _, metric := range metricFamily.Metric { + foundController := false + for _, l := range metric.Label { + if l.GetName() == "controller" && l.GetValue() == controllerName { + foundController = true + } + } + if !foundController { + continue + } + + if metric.GetCounter() != nil { + return metric.GetCounter().GetValue(), nil + } + } + } + } + + return 0, fmt.Errorf("failed to find %q metric", metricFamilyName) +} diff --git a/controllers/clustercache/doc.go b/controllers/clustercache/doc.go new file mode 100644 index 000000000000..9a89dab6375c --- /dev/null +++ b/controllers/clustercache/doc.go @@ -0,0 +1,57 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package clustercache implements the ClusterCache.
+//
+// The ClusterCache is used to create and cache clients, REST configs, etc. for workload clusters.
+//
+// # Setup
+//
+// ClusterCache can be set up via the SetupWithManager function. It will add the ClusterCache as a
+// reconciler to the Manager and then return a ClusterCache object.
+//
+// # Usage
+//
+// Typically the ClusterCache is passed to controllers, which can then e.g. simply call
+// GetClient to retrieve a client for the given Cluster (see the ClusterCache interface for more details).
+// If GetClient returns an error, controllers should neither requeue, requeue after, nor return an error and
+// go into exponential backoff. Instead, they should register a source (via GetClusterSource) to receive reconnect
+// events from the ClusterCache.
+//
+// # Implementation details
+//
+// The ClusterCache internally runs a reconciler that:
+//   - tries to create a connection to a workload cluster
+//   - if it fails, it will retry after roughly the ConnectionCreationRetryInterval
+//
+//   - if the connection is established, it will run continuous health checking every HealthProbe.Interval.
+//   - if the health checking fails more than HealthProbe.FailureThreshold times consecutively or if an
+//     unauthorized error occurs, the connection will be disconnected (a subsequent Reconcile will try
+//     to connect again)
+//
+//   - if other reconcilers (e.g. the Machine controller) registered a source to watch for events, they will get notified if:
+//   - a connect or disconnect happened
+//   - the health probe didn't succeed for a certain amount of time (if the WatchForProbeFailure option was used)
+//
+// # Implementation considerations
+//
+// Some considerations about the trade-offs that have been made:
+//   - There is intentionally only one Reconciler that handles both the connect/disconnect and health checking.
+//     This is a lot easier from a locking perspective than trying to coordinate between multiple reconcilers.
+//   - We are never holding the write lock on the clusterAccessor for an extended period of time (e.g. during
+//     connection creation, which can time out after a few seconds). This is important so that other reconcilers
+//     that are calling GetClient etc. are never blocked.
+package clustercache
diff --git a/controllers/clustercache/index.go b/controllers/clustercache/index.go
new file mode 100644
index 000000000000..ea5bf7fec3d1
--- /dev/null
+++ b/controllers/clustercache/index.go
@@ -0,0 +1,30 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
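
As an illustration of the usage described in the package documentation above, the following minimal sketch (not part of the patch itself) shows how a hypothetical consumer could use the ClusterCache. Only SetupWithManager, GetClusterSource, GetClient and ErrClusterNotConnected are APIs introduced by this PR; ExampleReconciler, its controller name "example" and the mapping function are made up for the example.

// Illustrative sketch only (not part of this patch): a hypothetical consumer of the ClusterCache.
package example

import (
	"context"
	"errors"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/controllers/clustercache"
)

// ExampleReconciler is a made-up reconciler that needs access to workload clusters.
type ExampleReconciler struct {
	Client       client.Client
	ClusterCache clustercache.ClusterCache
}

func (r *ExampleReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.Cluster{}).
		WithOptions(options).
		// Get reconcile requests when the connection to a workload cluster comes up or goes down.
		WatchesRawSource(r.ClusterCache.GetClusterSource("example", func(_ context.Context, o client.Object) []ctrl.Request {
			return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}}
		})).
		Complete(r)
}

func (r *ExampleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Get a cached client for the workload cluster of the Cluster being reconciled.
	remoteClient, err := r.ClusterCache.GetClient(ctx, req.NamespacedName)
	if err != nil {
		if errors.Is(err, clustercache.ErrClusterNotConnected) {
			// Don't return the error (which would cause exponential backoff); the cluster source
			// registered above will enqueue a new reconcile once the connection is re-established.
			return ctrl.Result{RequeueAfter: time.Minute}, nil
		}
		return ctrl.Result{}, err
	}

	// Use remoteClient to read/write objects in the workload cluster.
	_ = remoteClient
	return ctrl.Result{}, nil
}

The error handling mirrors the in-tree controllers updated by this patch, which requeue after a minute when ErrClusterNotConnected is returned instead of going into exponential backoff.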
+*/ + +package clustercache + +import ( + corev1 "k8s.io/api/core/v1" + + "sigs.k8s.io/cluster-api/api/v1beta1/index" +) + +// NodeProviderIDIndex is used to index Nodes by ProviderID. +var NodeProviderIDIndex = CacheOptionsIndex{ + Object: &corev1.Node{}, + Field: index.NodeProviderIDField, + ExtractValue: index.NodeByProviderID, +} diff --git a/controllers/clustercache/suite_test.go b/controllers/clustercache/suite_test.go new file mode 100644 index 000000000000..a9d8681627fe --- /dev/null +++ b/controllers/clustercache/suite_test.go @@ -0,0 +1,38 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clustercache + +import ( + "os" + "testing" + + ctrl "sigs.k8s.io/controller-runtime" + + "sigs.k8s.io/cluster-api/internal/test/envtest" +) + +var ( + env *envtest.Environment + ctx = ctrl.SetupSignalHandler() +) + +func TestMain(m *testing.M) { + os.Exit(envtest.Run(ctx, envtest.RunInput{ + M: m, + SetupEnv: func(e *envtest.Environment) { env = e }, + })) +} diff --git a/controlplane/kubeadm/controllers/alias.go b/controlplane/kubeadm/controllers/alias.go index 270f5222ba61..b7a54214a399 100644 --- a/controlplane/kubeadm/controllers/alias.go +++ b/controlplane/kubeadm/controllers/alias.go @@ -24,7 +24,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" kubeadmcontrolplanecontrollers "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/controllers" ) @@ -32,7 +32,7 @@ import ( type KubeadmControlPlaneReconciler struct { Client client.Client SecretCachingClient client.Client - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache EtcdDialTimeout time.Duration EtcdCallTimeout time.Duration @@ -49,7 +49,7 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg return (&kubeadmcontrolplanecontrollers.KubeadmControlPlaneReconciler{ Client: r.Client, SecretCachingClient: r.SecretCachingClient, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, EtcdDialTimeout: r.EtcdDialTimeout, EtcdCallTimeout: r.EtcdCallTimeout, WatchFilterValue: r.WatchFilterValue, diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 67a6d3aa3882..759f16e47bc2 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -30,7 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/secret" @@ -54,7 +54,7 @@ type ManagementCluster interface { type Management struct { Client client.Reader SecretCachingClient client.Reader - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache EtcdDialTimeout time.Duration 
EtcdCallTimeout time.Duration } @@ -105,14 +105,14 @@ func (m *Management) GetMachinePoolsForCluster(ctx context.Context, cluster *clu func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error) { // TODO(chuckha): Inject this dependency. // TODO(chuckha): memoize this function. The workload client only exists as long as a reconciliation loop. - restConfig, err := m.Tracker.GetRESTConfig(ctx, clusterKey) + restConfig, err := m.ClusterCache.GetRESTConfig(ctx, clusterKey) if err != nil { return nil, &RemoteClusterConnectionError{Name: clusterKey.String(), Err: err} } restConfig = rest.CopyConfig(restConfig) restConfig.Timeout = 30 * time.Second - c, err := m.Tracker.GetClient(ctx, clusterKey) + c, err := m.ClusterCache.GetClient(ctx, clusterKey) if err != nil { return nil, &RemoteClusterConnectionError{Name: clusterKey.String(), Err: err} } @@ -130,7 +130,7 @@ func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey client.O // TODO: consider if we can detect if we are using external etcd in a more explicit way (e.g. looking at the config instead of deriving from the existing certificates) var clientCert tls.Certificate if keyData != nil { - clientKey, err := m.Tracker.GetEtcdClientCertificateKey(ctx, clusterKey) + clientKey, err := m.ClusterCache.GetClientCertificatePrivateKey(ctx, clusterKey) if err != nil { return nil, err } diff --git a/controlplane/kubeadm/internal/cluster_test.go b/controlplane/kubeadm/internal/cluster_test.go index 233c67327e3d..77e807f78663 100644 --- a/controlplane/kubeadm/internal/cluster_test.go +++ b/controlplane/kubeadm/internal/cluster_test.go @@ -34,10 +34,13 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util/certs" "sigs.k8s.io/cluster-api/util/collections" @@ -109,6 +112,23 @@ func TestGetWorkloadCluster(t *testing.T) { badCrtEtcdSecret := etcdSecret.DeepCopy() badCrtEtcdSecret.Data[secret.TLSCrtDataName] = []byte("bad cert") + // Create Cluster + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-cluster", + Namespace: ns.Name, + }, + } + g.Expect(env.CreateAndWait(ctx, cluster)).To(Succeed()) + defer func(do client.Object) { + g.Expect(env.CleanupAndWait(ctx, do)).To(Succeed()) + }(cluster) + + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) + // Create kubeconfig secret // Store the envtest config as the contents of the kubeconfig secret. // This way we are using the envtest environment as both the @@ -191,22 +211,33 @@ func TestGetWorkloadCluster(t *testing.T) { }(o) } - // We have to create a new ClusterCacheTracker for every test case otherwise - // it could still have a rest config from a previous run cached. 
- tracker, err := remote.NewClusterCacheTracker( - env.Manager, - remote.ClusterCacheTrackerOptions{ - Log: &log.Log, + clusterCache, err := clustercache.SetupWithManager(ctx, env.Manager, clustercache.Options{ + SecretClient: env.Manager.GetClient(), + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, }, - ) + }, controller.Options{MaxConcurrentReconciles: 10, SkipNameValidation: ptr.To(true)}) g.Expect(err).ToNot(HaveOccurred()) m := Management{ Client: env.GetClient(), SecretCachingClient: secretCachingClient, - Tracker: tracker, + ClusterCache: clusterCache, } + // Ensure the ClusterCache reconciled at least once (and if possible created a clusterAccessor). + _, err = clusterCache.(reconcile.Reconciler).Reconcile(ctx, reconcile.Request{ + NamespacedName: client.ObjectKeyFromObject(cluster), + }) + g.Expect(err).ToNot(HaveOccurred()) + workloadCluster, err := m.GetWorkloadCluster(ctx, tt.clusterKey) if tt.expectErr { g.Expect(err).To(HaveOccurred()) diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index a128a96a7ea9..bd900097e012 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -40,7 +40,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" @@ -77,7 +77,7 @@ type KubeadmControlPlaneReconciler struct { SecretCachingClient client.Client controller controller.Controller recorder record.EventRecorder - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache EtcdDialTimeout time.Duration EtcdCallTimeout time.Duration @@ -109,7 +109,9 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg predicates.ClusterUnpausedAndInfrastructureReady(mgr.GetScheme(), predicateLog), ), ), - ).Build(r) + ). + WatchesRawSource(r.ClusterCache.GetClusterSource("kubeadmcontrolplane", r.ClusterToKubeadmControlPlane)). + Build(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } @@ -119,13 +121,13 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg r.ssaCache = ssa.NewCache() if r.managementCluster == nil { - if r.Tracker == nil { + if r.ClusterCache == nil { return errors.New("cluster cache tracker is nil, cannot create the internal management cluster resource") } r.managementCluster = &internal.Management{ Client: r.Client, SecretCachingClient: r.SecretCachingClient, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, EtcdDialTimeout: r.EtcdDialTimeout, EtcdCallTimeout: r.EtcdCallTimeout, } @@ -239,10 +241,8 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl. if !kcp.ObjectMeta.DeletionTimestamp.IsZero() { // Handle deletion reconciliation loop. res, err = r.reconcileDelete(ctx, controlPlane) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. 
- if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -250,10 +250,8 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl. // Handle normal reconciliation loop. res, err = r.reconcile(ctx, controlPlane) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index e8a4001e0e58..edf1993da4af 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -176,7 +176,7 @@ func TestReconcileUpdateObservedGeneration(t *testing.T) { Client: env, SecretCachingClient: secretCachingClient, recorder: record.NewFakeRecorder(32), - managementCluster: &internal.Management{Client: env.Client, Tracker: nil}, + managementCluster: &internal.Management{Client: env.Client, ClusterCache: nil}, } ns, err := env.CreateNamespace(ctx, "test-reconcile-upd-og") diff --git a/controlplane/kubeadm/main.go b/controlplane/kubeadm/main.go index aa46c743d8e2..ccc2f18a98b6 100644 --- a/controlplane/kubeadm/main.go +++ b/controlplane/kubeadm/main.go @@ -47,6 +47,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" kubeadmcontrolplanecontrollers "sigs.k8s.io/cluster-api/controlplane/kubeadm/controllers" @@ -67,29 +68,29 @@ var ( controllerName = "cluster-api-kubeadm-control-plane-manager" // flags. 
- enableLeaderElection bool - leaderElectionLeaseDuration time.Duration - leaderElectionRenewDeadline time.Duration - leaderElectionRetryPeriod time.Duration - watchFilterValue string - watchNamespace string - profilerAddress string - enableContentionProfiling bool - syncPeriod time.Duration - restConfigQPS float32 - restConfigBurst int - clusterCacheTrackerClientQPS float32 - clusterCacheTrackerClientBurst int - webhookPort int - webhookCertDir string - webhookCertName string - webhookKeyName string - healthAddr string - managerOptions = flags.ManagerOptions{} - logOptions = logs.NewOptions() + enableLeaderElection bool + leaderElectionLeaseDuration time.Duration + leaderElectionRenewDeadline time.Duration + leaderElectionRetryPeriod time.Duration + watchFilterValue string + watchNamespace string + profilerAddress string + enableContentionProfiling bool + syncPeriod time.Duration + restConfigQPS float32 + restConfigBurst int + clusterCacheClientQPS float32 + clusterCacheClientBurst int + webhookPort int + webhookCertDir string + webhookCertName string + webhookKeyName string + healthAddr string + managerOptions = flags.ManagerOptions{} + logOptions = logs.NewOptions() // KCP specific flags. kubeadmControlPlaneConcurrency int - clusterCacheTrackerConcurrency int + clusterCacheConcurrency int etcdDialTimeout time.Duration etcdCallTimeout time.Duration useDeprecatedInfraMachineNaming bool @@ -137,7 +138,7 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&kubeadmControlPlaneConcurrency, "kubeadmcontrolplane-concurrency", 10, "Number of kubeadm control planes to process simultaneously") - fs.IntVar(&clusterCacheTrackerConcurrency, "clustercachetracker-concurrency", 10, + fs.IntVar(&clusterCacheConcurrency, "clustercache-concurrency", 100, "Number of clusters to process simultaneously") fs.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, @@ -149,11 +150,11 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&restConfigBurst, "kube-api-burst", 30, "Maximum number of queries that should be allowed in one burst from the controller client to the Kubernetes API server.") - fs.Float32Var(&clusterCacheTrackerClientQPS, "clustercachetracker-client-qps", 20, - "Maximum queries per second from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.Float32Var(&clusterCacheClientQPS, "clustercache-client-qps", 20, + "Maximum queries per second from the cluster cache clients to the Kubernetes API server of workload clusters.") - fs.IntVar(&clusterCacheTrackerClientBurst, "clustercachetracker-client-burst", 30, - "Maximum number of queries that should be allowed in one burst from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.IntVar(&clusterCacheClientBurst, "clustercache-client-burst", 30, + "Maximum number of queries that should be allowed in one burst from the cluster cache clients to the Kubernetes API server of workload clusters.") fs.IntVar(&webhookPort, "webhook-port", 9443, "Webhook Server port") @@ -325,39 +326,34 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { os.Exit(1) } - // Set up a ClusterCacheTracker to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker(mgr, remote.ClusterCacheTrackerOptions{ - SecretCachingClient: secretCachingClient, - ControllerName: controllerName, - Log: &ctrl.Log, - ClientUncachedObjects: []client.Object{ - &corev1.ConfigMap{}, - &corev1.Secret{}, - &corev1.Pod{}, - &appsv1.Deployment{}, - 
&appsv1.DaemonSet{}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: secretCachingClient, + Cache: clustercache.CacheOptions{}, + Client: clustercache.ClientOptions{ + QPS: clusterCacheClientQPS, + Burst: clusterCacheClientBurst, + UserAgent: remote.DefaultClusterAPIUserAgent(controllerName), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + &corev1.ConfigMap{}, + &corev1.Secret{}, + &corev1.Pod{}, + &appsv1.Deployment{}, + &appsv1.DaemonSet{}, + }, + }, }, - ClientQPS: clusterCacheTrackerClientQPS, - ClientBurst: clusterCacheTrackerClientBurst, - }) - if err != nil { - setupLog.Error(err, "unable to create cluster cache tracker") - os.Exit(1) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, WatchFilterValue: watchFilterValue, - }).SetupWithManager(ctx, mgr, concurrency(clusterCacheTrackerConcurrency)); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "ClusterCacheReconciler") + }, concurrency(clusterCacheConcurrency)) + if err != nil { + setupLog.Error(err, "Unable to create ClusterCache") os.Exit(1) } if err := (&kubeadmcontrolplanecontrollers.KubeadmControlPlaneReconciler{ Client: mgr.GetClient(), SecretCachingClient: secretCachingClient, - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, EtcdDialTimeout: etcdDialTimeout, EtcdCallTimeout: etcdCallTimeout, diff --git a/exp/addons/controllers/alias.go b/exp/addons/controllers/alias.go index 63a1d324effe..8ff469621ffd 100644 --- a/exp/addons/controllers/alias.go +++ b/exp/addons/controllers/alias.go @@ -24,14 +24,14 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" clusterresourcesets "sigs.k8s.io/cluster-api/exp/addons/internal/controllers" ) // ClusterResourceSetReconciler reconciles a ClusterResourceSet object. type ClusterResourceSetReconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -40,7 +40,7 @@ type ClusterResourceSetReconciler struct { func (r *ClusterResourceSetReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options, partialSecretCache cache.Cache) error { return (&clusterresourcesets.ClusterResourceSetReconciler{ Client: r.Client, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options, partialSecretCache) } diff --git a/exp/addons/internal/controllers/clusterresourceset_controller.go b/exp/addons/internal/controllers/clusterresourceset_controller.go index 6fed7a91affc..6c4cc1d45c41 100644 --- a/exp/addons/internal/controllers/clusterresourceset_controller.go +++ b/exp/addons/internal/controllers/clusterresourceset_controller.go @@ -41,7 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" addonsv1 "sigs.k8s.io/cluster-api/exp/addons/api/v1beta1" resourcepredicates "sigs.k8s.io/cluster-api/exp/addons/internal/controllers/predicates" "sigs.k8s.io/cluster-api/util" @@ -61,8 +61,8 @@ var ErrSecretTypeNotSupported = errors.New("unsupported secret type") // ClusterResourceSetReconciler reconciles a ClusterResourceSet object. type ClusterResourceSetReconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -76,6 +76,7 @@ func (r *ClusterResourceSetReconciler) SetupWithManager(ctx context.Context, mgr &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.clusterToClusterResourceSet), ). + WatchesRawSource(r.ClusterCache.GetClusterSource("clusterresourceset", r.clusterToClusterResourceSet)). WatchesMetadata( &corev1.ConfigMap{}, handler.EnqueueRequestsFromMapFunc( @@ -159,14 +160,13 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R } errs := []error{} - errClusterLockedOccurred := false + errClusterNotConnectedOccurred := false for _, cluster := range clusters { if err := r.ApplyClusterResourceSet(ctx, cluster, clusterResourceSet); err != nil { - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") - errClusterLockedOccurred = true + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") + errClusterNotConnectedOccurred = true } else { // Append the error if the error is not ErrClusterLocked. errs = append(errs, err) @@ -195,8 +195,8 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R return ctrl.Result{}, kerrors.NewAggregate(errs) } - // Requeue if ErrClusterLocked was returned for one of the clusters. - if errClusterLockedOccurred { + // Requeue if ErrClusterNotConnected was returned for one of the clusters. + if errClusterNotConnectedOccurred { // Requeue after a minute to not end up in exponential delayed requeue which // could take up to 16m40s. 
return ctrl.Result{RequeueAfter: time.Minute}, nil @@ -353,7 +353,7 @@ func (r *ClusterResourceSetReconciler) ApplyClusterResourceSet(ctx context.Conte resourceSetBinding := clusterResourceSetBinding.GetOrCreateBinding(clusterResourceSet) - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { conditions.MarkFalse(clusterResourceSet, addonsv1.ResourcesAppliedCondition, addonsv1.RemoteClusterClientFailedReason, clusterv1.ConditionSeverityError, err.Error()) return err diff --git a/exp/addons/internal/controllers/clusterresourceset_controller_test.go b/exp/addons/internal/controllers/clusterresourceset_controller_test.go index b803bc620de5..da67a5391023 100644 --- a/exp/addons/internal/controllers/clusterresourceset_controller_test.go +++ b/exp/addons/internal/controllers/clusterresourceset_controller_test.go @@ -110,8 +110,15 @@ metadata: g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) t.Log("Creating the remote Cluster kubeconfig") g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) - _, err = tracker.GetClient(ctx, client.ObjectKeyFromObject(testCluster)) - g.Expect(err).ToNot(HaveOccurred()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(testCluster.DeepCopy()) + testCluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, testCluster, patch)).To(Succeed()) + + g.Eventually(func(g Gomega) { + _, err = clusterCache.GetClient(ctx, client.ObjectKeyFromObject(testCluster)) + g.Expect(err).ToNot(HaveOccurred()) + }, 1*time.Minute, 5*time.Second).Should(Succeed()) createConfigMapAndSecret(g, ns.Name, configmapName, secretName) return ns diff --git a/exp/addons/internal/controllers/suite_test.go b/exp/addons/internal/controllers/suite_test.go index bea34b9c1ea6..1e560ae533d5 100644 --- a/exp/addons/internal/controllers/suite_test.go +++ b/exp/addons/internal/controllers/suite_test.go @@ -32,15 +32,16 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" addonsv1 "sigs.k8s.io/cluster-api/exp/addons/api/v1beta1" "sigs.k8s.io/cluster-api/internal/test/envtest" ) var ( - env *envtest.Environment - tracker *remote.ClusterCacheTracker - ctx = ctrl.SetupSignalHandler() + env *envtest.Environment + clusterCache clustercache.ClusterCache + ctx = ctrl.SetupSignalHandler() ) func TestMain(m *testing.M) { @@ -77,14 +78,29 @@ func TestMain(m *testing.M) { panic(fmt.Sprintf("Failed to start cache for metadata only Secret watches: %v", err)) } - tracker, err = remote.NewClusterCacheTracker(mgr, remote.ClusterCacheTrackerOptions{}) + clusterCache, err = clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, + }, + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("Failed to create new cluster cache tracker: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } reconciler := ClusterResourceSetReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, + Client: mgr.GetClient(), + ClusterCache: clusterCache, } if err = reconciler.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}, partialSecretCache); err != nil { panic(fmt.Sprintf("Failed to set up cluster resource set reconciler: %v", err)) diff --git a/exp/controllers/alias.go b/exp/controllers/alias.go index 0db7dfb5f090..920ef26da2de 100644 --- a/exp/controllers/alias.go +++ b/exp/controllers/alias.go @@ -23,15 +23,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" machinepool "sigs.k8s.io/cluster-api/exp/internal/controllers" ) // MachinePoolReconciler reconciles a MachinePool object. type MachinePoolReconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -41,7 +41,7 @@ func (r *MachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctrl.M return (&machinepool.MachinePoolReconciler{ Client: r.Client, APIReader: r.APIReader, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } diff --git a/exp/internal/controllers/machinepool_controller.go b/exp/internal/controllers/machinepool_controller.go index f7fc99706daa..27d9a6e8a976 100644 --- a/exp/internal/controllers/machinepool_controller.go +++ b/exp/internal/controllers/machinepool_controller.go @@ -39,8 +39,8 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" @@ -71,9 +71,9 @@ const ( // MachinePoolReconciler reconciles a MachinePool object. type MachinePoolReconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -121,6 +121,7 @@ func (r *MachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctrl.M ), ), ). + WatchesRawSource(r.ClusterCache.GetClusterSource("machinepool", clusterToMachinePools)). Build(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") @@ -219,10 +220,9 @@ func (r *MachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Handle deletion reconciliation loop. if !mp.ObjectMeta.DeletionTimestamp.IsZero() { err := r.reconcileDelete(ctx, cluster, mp) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. 
- if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } @@ -235,10 +235,9 @@ func (r *MachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) machinePool: mp, } res, err := r.reconcile(ctx, scope) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -306,7 +305,7 @@ func (r *MachinePoolReconciler) reconcileDeleteNodes(ctx context.Context, cluste return nil } - clusterClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + clusterClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return err } @@ -370,9 +369,8 @@ func (r *MachinePoolReconciler) watchClusterNodes(ctx context.Context, cluster * return nil } - return r.Tracker.Watch(ctx, remote.WatchInput{ + return r.ClusterCache.Watch(ctx, util.ObjectKey(cluster), clustercache.WatchInput{ Name: "machinepool-watchNodes", - Cluster: util.ObjectKey(cluster), Watcher: r.controller, Kind: &corev1.Node{}, EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachinePool), diff --git a/exp/internal/controllers/machinepool_controller_noderef.go b/exp/internal/controllers/machinepool_controller_noderef.go index 8d34d4d148e0..20748cbccbc0 100644 --- a/exp/internal/controllers/machinepool_controller_noderef.go +++ b/exp/internal/controllers/machinepool_controller_noderef.go @@ -75,7 +75,7 @@ func (r *MachinePoolReconciler) reconcileNodeRefs(ctx context.Context, s *scope) return ctrl.Result{}, nil } - clusterClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + clusterClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } @@ -86,7 +86,7 @@ func (r *MachinePoolReconciler) reconcileNodeRefs(ctx context.Context, s *scope) // Return early if nodeRefMap is nil. if s.nodeRefMap == nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to get node references") + return ctrl.Result{}, errors.New("failed to get Node references") } // Get the Node references. 
diff --git a/exp/internal/controllers/machinepool_controller_phases.go b/exp/internal/controllers/machinepool_controller_phases.go index e46b58f8818c..3a5474d05914 100644 --- a/exp/internal/controllers/machinepool_controller_phases.go +++ b/exp/internal/controllers/machinepool_controller_phases.go @@ -285,7 +285,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * conditions.WithFallbackValue(ready, clusterv1.WaitingForInfrastructureFallbackReason, clusterv1.ConditionSeverityInfo, ""), ) - clusterClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + clusterClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } diff --git a/exp/internal/controllers/machinepool_controller_phases_test.go b/exp/internal/controllers/machinepool_controller_phases_test.go index 1029ba800729..6bbab052f53f 100644 --- a/exp/internal/controllers/machinepool_controller_phases_test.go +++ b/exp/internal/controllers/machinepool_controller_phases_test.go @@ -21,7 +21,6 @@ import ( "testing" "time" - "github.com/go-logr/logr" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -33,11 +32,10 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/controller-runtime/pkg/log" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/util/ssa" @@ -128,8 +126,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -165,8 +163,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -199,8 +197,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: 
fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -249,8 +247,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -311,8 +309,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -351,8 +349,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -398,8 +396,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -458,8 +456,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, 
builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -524,8 +522,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -588,8 +586,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinePool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -674,8 +672,8 @@ func TestReconcileMachinePoolPhases(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(defaultCluster, defaultKubeconfigSecret, machinePool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -1285,8 +1283,8 @@ func TestReconcileMachinePoolInfrastructure(t *testing.T) { infraConfig := &unstructured.Unstructured{Object: tc.infraConfig} fakeClient := fake.NewClientBuilder().WithObjects(tc.machinepool, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: defaultCluster.Name, Namespace: defaultCluster.Namespace}), } scope := &scope{ @@ -1788,9 +1786,9 @@ func TestReconcileMachinePoolScaleToFromZero(t *testing.T) { 
fakeClient := fake.NewClientBuilder().WithObjects(testCluster, kubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), env.GetClient(), env.GetClient(), env.GetClient().Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - recorder: record.NewFakeRecorder(32), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(env.GetClient(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + recorder: record.NewFakeRecorder(32), } scope := &scope{ @@ -1850,9 +1848,9 @@ func TestReconcileMachinePoolScaleToFromZero(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(testCluster, kubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), env.GetClient(), env.GetClient(), env.GetClient().Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - recorder: record.NewFakeRecorder(32), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(env.GetClient(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + recorder: record.NewFakeRecorder(32), } scope := &scope{ @@ -1895,9 +1893,9 @@ func TestReconcileMachinePoolScaleToFromZero(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(testCluster, kubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - recorder: record.NewFakeRecorder(32), - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + Client: fakeClient, + recorder: record.NewFakeRecorder(32), + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), } scope := &scope{ @@ -1936,9 +1934,9 @@ func TestReconcileMachinePoolScaleToFromZero(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(testCluster, kubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - recorder: record.NewFakeRecorder(32), - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), fakeClient, fakeClient, fakeClient.Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + Client: fakeClient, + recorder: record.NewFakeRecorder(32), + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), } scope := &scope{ @@ -1999,9 +1997,9 @@ func TestReconcileMachinePoolScaleToFromZero(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(testCluster, kubeconfigSecret, machinepool, bootstrapConfig, infraConfig, builder.TestBootstrapConfigCRD, builder.TestInfrastructureMachineTemplateCRD).Build() r := &MachinePoolReconciler{ - Client: fakeClient, - Tracker: 
remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), env.GetClient(), env.GetClient(), env.GetClient().Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - recorder: record.NewFakeRecorder(32), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(env.GetClient(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + recorder: record.NewFakeRecorder(32), } scope := &scope{ diff --git a/exp/internal/controllers/machinepool_controller_test.go b/exp/internal/controllers/machinepool_controller_test.go index 31af2e75251f..0d5e1e40acb5 100644 --- a/exp/internal/controllers/machinepool_controller_test.go +++ b/exp/internal/controllers/machinepool_controller_test.go @@ -38,7 +38,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/util" @@ -553,9 +553,9 @@ func TestReconcileMachinePoolRequest(t *testing.T) { }).WithObjects(trackerObjects...).Build() r := &MachinePoolReconciler{ - Client: clientFake, - APIReader: clientFake, - Tracker: remote.NewTestClusterCacheTracker(ctrl.LoggerFrom(ctx), clientFake, trackerClientFake, clientFake.Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + Client: clientFake, + APIReader: clientFake, + ClusterCache: clustercache.NewFakeClusterCache(trackerClientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), } result, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(&tc.machinePool)}) @@ -824,8 +824,8 @@ func TestRemoveMachinePoolFinalizerAfterDeleteReconcile(t *testing.T) { key := client.ObjectKey{Namespace: m.Namespace, Name: m.Name} clientFake := fake.NewClientBuilder().WithObjects(testCluster, m).WithStatusSubresource(&expv1.MachinePool{}).Build() mr := &MachinePoolReconciler{ - Client: clientFake, - Tracker: remote.NewTestClusterCacheTracker(ctrl.LoggerFrom(ctx), clientFake, clientFake, clientFake.Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + Client: clientFake, + ClusterCache: clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), } _, err := mr.Reconcile(ctx, reconcile.Request{NamespacedName: key}) g.Expect(err).ToNot(HaveOccurred()) @@ -1100,9 +1100,9 @@ func TestMachinePoolConditions(t *testing.T) { ).WithStatusSubresource(&expv1.MachinePool{}).Build() r := &MachinePoolReconciler{ - Client: clientFake, - APIReader: clientFake, - Tracker: remote.NewTestClusterCacheTracker(ctrl.LoggerFrom(ctx), clientFake, clientFake, clientFake.Scheme(), client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + Client: clientFake, + APIReader: clientFake, + ClusterCache: clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), } _, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(machinePool)}) diff --git a/exp/internal/controllers/suite_test.go b/exp/internal/controllers/suite_test.go index 93194d2f8361..86446177ac29 100644 --- a/exp/internal/controllers/suite_test.go +++ b/exp/internal/controllers/suite_test.go @@ -22,10 +22,14 @@ import ( "os" "testing" + corev1 "k8s.io/api/core/v1" ctrl 
"sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" + "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/test/envtest" ) @@ -42,12 +46,31 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - machinePoolReconciler := MachinePoolReconciler{ - Client: mgr.GetClient(), - recorder: mgr.GetEventRecorderFor("machinepool-controller"), - } - err := machinePoolReconciler.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}) + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, + }, + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) + } + + if err := (&MachinePoolReconciler{ + Client: mgr.GetClient(), + ClusterCache: clusterCache, + recorder: mgr.GetEventRecorderFor("machinepool-controller"), + }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to set up machine pool reconciler: %v", err)) } } diff --git a/exp/topology/desiredstate/desired_state.go b/exp/topology/desiredstate/desired_state.go index c80e5c5a3091..680f9f70d798 100644 --- a/exp/topology/desiredstate/desired_state.go +++ b/exp/topology/desiredstate/desired_state.go @@ -32,8 +32,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1" @@ -57,10 +57,10 @@ type Generator interface { } // NewGenerator creates a new generator to generate desired state. -func NewGenerator(client client.Client, tracker *remote.ClusterCacheTracker, runtimeClient runtimeclient.Client) Generator { +func NewGenerator(client client.Client, clusterCache clustercache.ClusterCache, runtimeClient runtimeclient.Client) Generator { return &generator{ Client: client, - Tracker: tracker, + ClusterCache: clusterCache, RuntimeClient: runtimeClient, patchEngine: patches.NewEngine(runtimeClient), } @@ -71,7 +71,7 @@ func NewGenerator(client client.Client, tracker *remote.ClusterCacheTracker, run type generator struct { Client client.Client - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache RuntimeClient runtimeclient.Client @@ -118,7 +118,7 @@ func (g *generator) Generate(ctx context.Context, s *scope.Scope) (*scope.Cluste // - Make upgrade decisions on the control plane. // - Making upgrade decisions on machine pools. 
if len(s.Current.MachinePools) > 0 { - client, err := g.Tracker.GetClient(ctx, client.ObjectKeyFromObject(s.Current.Cluster)) + client, err := g.ClusterCache.GetClient(ctx, client.ObjectKeyFromObject(s.Current.Cluster)) if err != nil { return nil, errors.Wrap(err, "failed to check if any MachinePool is upgrading") } diff --git a/internal/controllers/cluster/cluster_controller.go b/internal/controllers/cluster/cluster_controller.go index ca1d86e22d0c..1d97e001ff04 100644 --- a/internal/controllers/cluster/cluster_controller.go +++ b/internal/controllers/cluster/cluster_controller.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" @@ -40,6 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/feature" @@ -48,6 +50,7 @@ import ( "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" + conditionsv1beta2 "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/finalizers" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" @@ -70,12 +73,15 @@ const ( // Reconciler reconciles a Cluster object. type Reconciler struct { - Client client.Client - APIReader client.Reader + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string + RemoteConnectionGracePeriod time.Duration + recorder record.EventRecorder externalTracker external.ObjectTracker } @@ -84,6 +90,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "cluster") c, err := ctrl.NewControllerManagedBy(mgr). For(&clusterv1.Cluster{}). + WatchesRawSource(r.ClusterCache.GetClusterSource("cluster", func(_ context.Context, o client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}} + }, clustercache.WatchForProbeFailure(r.RemoteConnectionGracePeriod))). 
Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(r.controlPlaneMachineToCluster), @@ -153,6 +162,29 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re } }() + lastProbeSuccessTime := r.ClusterCache.GetLastProbeSuccessTimestamp(ctx, client.ObjectKeyFromObject(cluster)) + if time.Since(lastProbeSuccessTime) > r.RemoteConnectionGracePeriod { + var msg string + if lastProbeSuccessTime.IsZero() { + msg = "Remote connection probe failed" + } else { + msg = fmt.Sprintf("Remote connection probe failed, probe last succeeded at %s", lastProbeSuccessTime.Format(time.RFC3339)) + } + conditionsv1beta2.Set(cluster, metav1.Condition{ + Type: clusterv1.ClusterRemoteConnectionProbeV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.ClusterRemoteConnectionProbeFailedV1Beta2Reason, + Message: msg, + }) + } else { + conditionsv1beta2.Set(cluster, metav1.Condition{ + Type: clusterv1.ClusterRemoteConnectionProbeV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.ClusterRemoteConnectionProbeSucceededV1Beta2Reason, + Message: "Remote connection probe succeeded", + }) + } + // Handle deletion reconciliation loop. if !cluster.ObjectMeta.DeletionTimestamp.IsZero() { return r.reconcileDelete(ctx, cluster) diff --git a/internal/controllers/cluster/cluster_controller_test.go b/internal/controllers/cluster/cluster_controller_test.go index 1025644505c7..828dda74e8fe 100644 --- a/internal/controllers/cluster/cluster_controller_test.go +++ b/internal/controllers/cluster/cluster_controller_test.go @@ -36,6 +36,7 @@ import ( "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + conditionsv1beta2 "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/patch" ) @@ -80,6 +81,28 @@ func TestClusterReconciler(t *testing.T) { } return len(instance.Finalizers) > 0 }, timeout).Should(BeTrue()) + + // Validate the RemoteConnectionProbe condition is false (because kubeconfig Secret doesn't exist) + g.Eventually(func(g Gomega) { + g.Expect(env.Get(ctx, key, instance)).To(Succeed()) + + condition := conditionsv1beta2.Get(instance, clusterv1.ClusterRemoteConnectionProbeV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(condition.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(condition.Reason).To(Equal(clusterv1.ClusterRemoteConnectionProbeFailedV1Beta2Reason)) + }, timeout).Should(Succeed()) + + t.Log("Creating the Cluster Kubeconfig Secret") + g.Expect(env.CreateKubeconfigSecret(ctx, instance)).To(Succeed()) + + g.Eventually(func(g Gomega) { + g.Expect(env.Get(ctx, key, instance)).To(Succeed()) + + condition := conditionsv1beta2.Get(instance, clusterv1.ClusterRemoteConnectionProbeV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(condition.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(condition.Reason).To(Equal(clusterv1.ClusterRemoteConnectionProbeSucceededV1Beta2Reason)) + }, timeout).Should(Succeed()) }) t.Run("Should successfully patch a cluster object if the status diff is empty but the spec diff is not", func(t *testing.T) { diff --git a/internal/controllers/cluster/suite_test.go b/internal/controllers/cluster/suite_test.go index ef2985a14ca6..64d7ab23540d 100644 --- a/internal/controllers/cluster/suite_test.go +++ b/internal/controllers/cluster/suite_test.go @@ -24,14 +24,17 @@ import ( "time" . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" machinecontroller "sigs.k8s.io/cluster-api/internal/controllers/machine" "sigs.k8s.io/cluster-api/internal/test/envtest" @@ -61,35 +64,38 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err)) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, - }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { - panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } if err := (&Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetClient(), + Client: mgr.GetClient(), + APIReader: mgr.GetClient(), + ClusterCache: clusterCache, + RemoteConnectionGracePeriod: 50 * time.Second, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start ClusterReconciler: %v", err)) } if err := (&machinecontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineReconciler: %v", err)) } diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 4b6189811e46..f4b2b7d9701f 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -43,9 +43,9 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" "sigs.k8s.io/cluster-api/controllers/noderefutil" - "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/controllers/machine/drain" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" @@ -81,9 +81,9 
@@ var ( // Reconciler reconciles a Machine object. type Reconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -135,6 +135,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ), )). + WatchesRawSource(r.ClusterCache.GetClusterSource("machine", clusterToMachines)). Watches( &clusterv1.MachineSet{}, handler.EnqueueRequestsFromMapFunc(msToMachines), @@ -242,10 +243,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re ) res, err := doReconcile(ctx, reconcileDelete, s) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -253,10 +253,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re // Handle normal reconciliation loop. res, err := doReconcile(ctx, alwaysReconcile, s) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -622,7 +621,7 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1 // NOTE: The following is a best-effort attempt to retrieve the node, // errors are logged but not returned to ensure machines are deleted // even if the node cannot be retrieved. 
- remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { log.Error(err, "Failed to get cluster client while deleting Machine and checking for nodes") } else { @@ -697,10 +696,10 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster, log := ctrl.LoggerFrom(ctx, "Node", klog.KRef("", nodeName)) ctx = ctrl.LoggerInto(ctx, log) - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing drain Node because another worker has the lock on the ClusterCacheTracker") + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing drain Node because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } log.Error(err, "Error creating a remote client for cluster while draining Node, won't retry") @@ -767,7 +766,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster, // even the "no-op" drain would be requeued. if cacheEntry, ok := r.drainCache.Has(client.ObjectKeyFromObject(machine)); ok { if requeueAfter, requeue := shouldRequeueDrain(time.Now(), cacheEntry.LastDrain); requeue { - log.Info(fmt.Sprintf("Requeuing in %s because there already was a drain in the last %s", requeueAfter, drainRetryInterval)) + log.Info(fmt.Sprintf("Requeuing in %s because there already was a drain in the last %s", requeueAfter.Truncate(time.Second/10), drainRetryInterval)) return ctrl.Result{RequeueAfter: requeueAfter}, nil } } @@ -817,7 +816,7 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus log := ctrl.LoggerFrom(ctx, "Node", klog.KRef("", nodeName)) ctx = ctrl.LoggerInto(ctx, log) - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return true, err } @@ -850,10 +849,10 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error { log := ctrl.LoggerFrom(ctx) - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { - if errors.Is(err, remote.ErrClusterLocked) { - return errors.Wrapf(err, "failed deleting Node because another worker has the lock on the ClusterCacheTracker") + if errors.Is(err, clustercache.ErrClusterNotConnected) { + return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down") } log.Error(err, "Error creating a remote client for cluster while deleting Node, won't retry") return nil @@ -941,13 +940,12 @@ func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.C } // If there is no tracker, don't watch remote nodes - if r.Tracker == nil { + if r.ClusterCache == nil { return nil } - return r.Tracker.Watch(ctx, remote.WatchInput{ + return r.ClusterCache.Watch(ctx, util.ObjectKey(cluster), clustercache.WatchInput{ Name: "machine-watchNodes", - Cluster: util.ObjectKey(cluster), Watcher: r.controller, Kind: &corev1.Node{}, EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachine), diff --git a/internal/controllers/machine/machine_controller_noderef.go 
b/internal/controllers/machine/machine_controller_noderef.go index 9643b3457608..c6e5abee621d 100644 --- a/internal/controllers/machine/machine_controller_noderef.go +++ b/internal/controllers/machine/machine_controller_noderef.go @@ -63,7 +63,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, return ctrl.Result{}, nil } - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return ctrl.Result{}, err } diff --git a/internal/controllers/machine/machine_controller_noderef_test.go b/internal/controllers/machine/machine_controller_noderef_test.go index acb486cd48a5..16ba51a118f5 100644 --- a/internal/controllers/machine/machine_controller_noderef_test.go +++ b/internal/controllers/machine/machine_controller_noderef_test.go @@ -31,11 +31,13 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/topology/ownerrefs" @@ -204,9 +206,9 @@ func TestReconcileNode(t *testing.T) { } r := &Reconciler{ - Tracker: remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(defaultCluster)), - Client: c, - recorder: record.NewFakeRecorder(10), + ClusterCache: clustercache.NewFakeClusterCache(c, client.ObjectKeyFromObject(defaultCluster)), + Client: c, + recorder: record.NewFakeRecorder(10), } s := &scope{cluster: defaultCluster, machine: tc.machine} result, err := r.reconcileNode(ctx, s) @@ -239,6 +241,11 @@ func TestGetNode(t *testing.T) { } g.Expect(env.Create(ctx, testCluster)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(testCluster.DeepCopy()) + testCluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, testCluster, patch)).To(Succeed()) + g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) defer func(do ...client.Object) { g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) @@ -310,35 +317,50 @@ func TestGetNode(t *testing.T) { g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) }(nodesToCleanup...) - tracker, err := remote.NewClusterCacheTracker( - env.Manager, remote.ClusterCacheTrackerOptions{ - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, env.Manager, clustercache.Options{ + SecretClient: env.Manager.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) - g.Expect(err).ToNot(HaveOccurred()) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10, SkipNameValidation: ptr.To(true)}) + if err != nil { + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) + } r := &Reconciler{ - Tracker: tracker, - Client: env, + ClusterCache: clusterCache, + Client: env, } w, err := ctrl.NewControllerManagedBy(env.Manager).For(&corev1.Node{}).Build(r) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(tracker.Watch(ctx, remote.WatchInput{ - Name: "TestGetNode", - Cluster: util.ObjectKey(testCluster), - Watcher: w, - Kind: &corev1.Node{}, - EventHandler: handler.EnqueueRequestsFromMapFunc(func(context.Context, client.Object) []reconcile.Request { - return nil - }), - })).To(Succeed()) + // Retry because the ClusterCache might not have immediately created the clusterAccessor. + g.Eventually(func(g Gomega) { + g.Expect(clusterCache.Watch(ctx, util.ObjectKey(testCluster), clustercache.WatchInput{ + Name: "TestGetNode", + Watcher: w, + Kind: &corev1.Node{}, + EventHandler: handler.EnqueueRequestsFromMapFunc(func(context.Context, client.Object) []reconcile.Request { + return nil + }), + })).To(Succeed()) + }, 1*time.Minute, 5*time.Second).Should(Succeed()) for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(testCluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(testCluster)) g.Expect(err).ToNot(HaveOccurred()) node, err := r.getNode(ctx, remoteClient, tc.providerIDInput) @@ -514,7 +536,11 @@ func TestNodeLabelSync(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret := kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) - g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + g.Expect(env.CreateAndWait(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) // Set InfrastructureMachine .status.interruptible and .status.ready to true. diff --git a/internal/controllers/machine/machine_controller_status_test.go b/internal/controllers/machine/machine_controller_status_test.go index 80a3204c2a37..2da725277532 100644 --- a/internal/controllers/machine/machine_controller_status_test.go +++ b/internal/controllers/machine/machine_controller_status_test.go @@ -1138,6 +1138,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. 
+ patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1186,6 +1190,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1225,6 +1233,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1293,6 +1305,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1378,6 +1394,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1452,6 +1472,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. 
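A note on the test preparation repeated throughout these hunks (not part of the patch): the ClusterCache only creates a clusterAccessor for a Cluster once its infrastructure is reported ready, so the tests patch the Cluster status before exercising the reconcilers. The recurring snippet could be factored into a helper like the hypothetical one sketched below.

	// markInfrastructureReady sets InfrastructureReady to true so the ClusterCache
	// creates the clusterAccessor for the given Cluster (hypothetical test helper).
	func markInfrastructureReady(ctx context.Context, c client.Client, cluster *clusterv1.Cluster) error {
		patch := client.MergeFrom(cluster.DeepCopy())
		cluster.Status.InfrastructureReady = true
		return c.Status().Patch(ctx, cluster, patch)
	}
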
+ patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1515,6 +1539,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1583,6 +1611,10 @@ func TestReconcileMachinePhases(t *testing.T) { g.Expect(env.Create(ctx, cluster)).To(Succeed()) defaultKubeconfigSecret = kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(env.Config, cluster)) g.Expect(env.Create(ctx, defaultKubeconfigSecret)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) g.Expect(env.Create(ctx, bootstrapConfig)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -1629,6 +1661,7 @@ func TestReconcileMachinePhases(t *testing.T) { } g.Expect(machine.Status.GetTypedPhase()).To(Equal(clusterv1.MachinePhaseDeleting)) nodeHealthyCondition := conditions.Get(machine, clusterv1.MachineNodeHealthyCondition) + g.Expect(nodeHealthyCondition).ToNot(BeNil()) g.Expect(nodeHealthyCondition.Status).To(Equal(corev1.ConditionFalse)) g.Expect(nodeHealthyCondition.Reason).To(Equal(clusterv1.DeletingReason)) // Verify that the LastUpdated timestamp was updated diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index 06fbcbeca3fe..c8d90e1fdf56 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -22,14 +22,12 @@ import ( "testing" "time" - "github.com/go-logr/logr" . 
"github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -37,12 +35,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/internal/controllers/machine/drain" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/util/ssa" @@ -111,6 +108,11 @@ func TestWatches(t *testing.T) { g.Expect(env.Create(ctx, testCluster)).To(Succeed()) g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + testClusterOriginal := client.MergeFrom(testCluster.DeepCopy()) + testCluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, testCluster, testClusterOriginal)).To(Succeed()) + g.Expect(env.Create(ctx, defaultBootstrap)).To(Succeed()) g.Expect(env.Create(ctx, node)).To(Succeed()) g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) @@ -913,10 +915,10 @@ func TestReconcileRequest(t *testing.T) { ).WithStatusSubresource(&clusterv1.Machine{}).WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).Build() r := &Reconciler{ - Client: clientFake, - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), clientFake, clientFake, scheme.Scheme, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - ssaCache: ssa.NewCache(), - recorder: record.NewFakeRecorder(10), + Client: clientFake, + ClusterCache: clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + ssaCache: ssa.NewCache(), + recorder: record.NewFakeRecorder(10), } result, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(&tc.machine)}) @@ -1193,10 +1195,10 @@ func TestMachineConditions(t *testing.T) { Build() r := &Reconciler{ - Client: clientFake, - recorder: record.NewFakeRecorder(10), - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), clientFake, clientFake, scheme.Scheme, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - ssaCache: ssa.NewCache(), + Client: clientFake, + recorder: record.NewFakeRecorder(10), + ClusterCache: clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + ssaCache: ssa.NewCache(), } _, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(&machine)}) @@ -1562,11 +1564,10 @@ func TestDrainNode(t *testing.T) { WithObjects(remoteObjs...). 
Build() - tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, remoteClient, fakeScheme, client.ObjectKeyFromObject(testCluster)) r := &Reconciler{ - Client: c, - Tracker: tracker, - drainCache: drain.NewCache(), + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), + drainCache: drain.NewCache(), } res, err := r.drainNode(ctx, testCluster, testMachine, tt.nodeName) @@ -1661,12 +1662,11 @@ func TestDrainNode_withCaching(t *testing.T) { WithObjects(remoteObjs...). Build() - tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, remoteClient, fakeScheme, client.ObjectKeyFromObject(testCluster)) drainCache := drain.NewCache() r := &Reconciler{ - Client: c, - Tracker: tracker, - drainCache: drainCache, + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), + drainCache: drainCache, } // The first reconcile will cordon the Node, evict the one Pod running on the Node and then requeue. @@ -1962,10 +1962,9 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { objs = append(objs, testCluster, tt.node) c := fake.NewClientBuilder().WithObjects(objs...).Build() - tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster)) r := &Reconciler{ - Client: c, - Tracker: tracker, + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(c, client.ObjectKeyFromObject(testCluster)), } got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name) @@ -2432,6 +2431,11 @@ func TestNodeToMachine(t *testing.T) { g.Expect(env.Create(ctx, testCluster)).To(Succeed()) g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. 
+ testClusterOriginal := client.MergeFrom(testCluster.DeepCopy()) + testCluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, testCluster, testClusterOriginal)).To(Succeed()) + g.Expect(env.Create(ctx, defaultBootstrap)).To(Succeed()) g.Expect(env.Create(ctx, targetNode)).To(Succeed()) g.Expect(env.Create(ctx, randomNode)).To(Succeed()) @@ -2767,11 +2771,10 @@ func TestNodeDeletion(t *testing.T) { m.Spec.NodeDeletionTimeout = tc.deletionTimeout fakeClient := tc.createFakeClient(node, m, cpmachine1) - tracker := remote.NewTestClusterCacheTracker(ctrl.Log, fakeClient, fakeClient, fakeScheme, client.ObjectKeyFromObject(&testCluster)) r := &Reconciler{ Client: fakeClient, - Tracker: tracker, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)), recorder: record.NewFakeRecorder(10), nodeDeletionRetryTimeout: 10 * time.Millisecond, } @@ -2897,11 +2900,10 @@ func TestNodeDeletionWithoutNodeRefFallback(t *testing.T) { m.Spec.NodeDeletionTimeout = tc.deletionTimeout fakeClient := tc.createFakeClient(node, m, cpmachine1) - tracker := remote.NewTestClusterCacheTracker(ctrl.Log, fakeClient, fakeClient, fakeScheme, client.ObjectKeyFromObject(&testCluster)) r := &Reconciler{ Client: fakeClient, - Tracker: tracker, + ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)), recorder: record.NewFakeRecorder(10), nodeDeletionRetryTimeout: 10 * time.Millisecond, } diff --git a/internal/controllers/machine/suite_test.go b/internal/controllers/machine/suite_test.go index eb0ad7a983e1..94c68649886c 100644 --- a/internal/controllers/machine/suite_test.go +++ b/internal/controllers/machine/suite_test.go @@ -33,10 +33,12 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/test/envtest" ) @@ -66,29 +68,35 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err)) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, - }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { - panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } + // Setting ConnectionCreationRetryInterval to 2 seconds, otherwise client creation is + // only retried every 30s. If we get unlucky tests are then failing with timeout. + clusterCache.(interface{ SetConnectionCreationRetryInterval(time.Duration) }). + SetConnectionCreationRetryInterval(2 * time.Second) + if err := (&Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineReconciler: %v", err)) } diff --git a/internal/controllers/machinedeployment/machinedeployment_controller_test.go b/internal/controllers/machinedeployment/machinedeployment_controller_test.go index 51e14f39d82c..2b5aecad8020 100644 --- a/internal/controllers/machinedeployment/machinedeployment_controller_test.go +++ b/internal/controllers/machinedeployment/machinedeployment_controller_test.go @@ -62,6 +62,11 @@ func TestMachineDeploymentReconciler(t *testing.T) { t.Log("Creating the Cluster Kubeconfig Secret") g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. + patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) + return ns, cluster } diff --git a/internal/controllers/machinedeployment/suite_test.go b/internal/controllers/machinedeployment/suite_test.go index 104df1bb50b0..81ebe8c5ab52 100644 --- a/internal/controllers/machinedeployment/suite_test.go +++ b/internal/controllers/machinedeployment/suite_test.go @@ -39,6 +39,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" machinecontroller "sigs.k8s.io/cluster-api/internal/controllers/machine" machinesetcontroller "sigs.k8s.io/cluster-api/internal/controllers/machineset" @@ -69,36 +70,37 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err)) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, - }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { - panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } if err := (&machinecontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineReconciler: %v", err)) } if err := (&machinesetcontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineSetReconciler: %v", err)) } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index a6e4d0d60366..568e494dc174 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -44,8 +44,8 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/controllers/machine" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -74,8 +74,8 @@ const ( // Reconciler reconciles a MachineHealthCheck object. type Reconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -104,7 +104,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ), ), - ).Build(r) + ). + WatchesRawSource(r.ClusterCache.GetClusterSource("machinehealthcheck", r.clusterToMachineHealthCheck)). + Build(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } @@ -171,13 +173,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re result, err := r.reconcile(ctx, log, cluster, m) if err != nil { - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. 
+ if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } - log.Error(err, "Failed to reconcile MachineHealthCheck") r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err) // Requeue immediately if any errors occurred @@ -200,7 +200,7 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster var remoteClient client.Client if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) { var err error - remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err = r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { logger.Error(err, "Error creating remote cluster cache") return ctrl.Result{}, err @@ -528,13 +528,12 @@ func (r *Reconciler) nodeToMachineHealthCheck(ctx context.Context, o client.Obje func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error { // If there is no tracker, don't watch remote nodes - if r.Tracker == nil { + if r.ClusterCache == nil { return nil } - return r.Tracker.Watch(ctx, remote.WatchInput{ + return r.ClusterCache.Watch(ctx, util.ObjectKey(cluster), clustercache.WatchInput{ Name: "machinehealthcheck-watchClusterNodes", - Cluster: util.ObjectKey(cluster), Watcher: r.controller, Kind: &corev1.Node{}, EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachineHealthCheck), diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 71c35d951eb9..3a1dbeeb152f 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -33,7 +33,6 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/uuid" - "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -44,7 +43,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" capierrors "sigs.k8s.io/cluster-api/errors" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/webhooks" @@ -2352,13 +2351,9 @@ func createCluster(g *WithT, namespaceName string) *clusterv1.Cluster { }, } - g.Expect(env.Create(ctx, cluster)).To(Succeed()) + g.Expect(env.CreateAndWait(ctx, cluster)).To(Succeed()) - // Make sure the cluster is in the cache before proceeding - g.Eventually(func() error { - var cl clusterv1.Cluster - return env.Get(ctx, util.ObjectKey(cluster), &cl) - }, timeout, 100*time.Millisecond).Should(Succeed()) + g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) // This is required for MHC to perform checks patchHelper, err := patch.NewHelper(cluster, env.Client) @@ -2366,17 +2361,11 @@ func createCluster(g *WithT, namespaceName string) *clusterv1.Cluster { conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition) g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed()) - // Wait for cluster in cache to be updated post-patch - g.Eventually(func() bool { - err := env.Get(ctx, util.ObjectKey(cluster), cluster) - if err != nil { - return false - } - - return conditions.IsTrue(cluster, 
clusterv1.InfrastructureReadyCondition) - }, timeout, 100*time.Millisecond).Should(BeTrue()) - - g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) + // Wait for cluster in the cached client to be updated post-patch + g.Eventually(func(g Gomega) { + g.Expect(env.Get(ctx, util.ObjectKey(cluster), cluster)).To(Succeed()) + g.Expect(conditions.IsTrue(cluster, clusterv1.InfrastructureReadyCondition)).To(BeTrue()) + }, timeout, 100*time.Millisecond).Should(Succeed()) return cluster } @@ -2680,9 +2669,9 @@ func TestPatchTargets(t *testing.T) { mhc, ).WithStatusSubresource(&clusterv1.MachineHealthCheck{}, &clusterv1.Machine{}).Build() r := &Reconciler{ - Client: cl, - recorder: record.NewFakeRecorder(32), - Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), cl, cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"), + Client: cl, + recorder: record.NewFakeRecorder(32), + ClusterCache: clustercache.NewFakeClusterCache(cl, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"), } // To make the patch fail, create patchHelper with a different client. diff --git a/internal/controllers/machinehealthcheck/suite_test.go b/internal/controllers/machinehealthcheck/suite_test.go index fdd86997d03a..69dc6beea760 100644 --- a/internal/controllers/machinehealthcheck/suite_test.go +++ b/internal/controllers/machinehealthcheck/suite_test.go @@ -24,14 +24,17 @@ import ( "time" . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" clustercontroller "sigs.k8s.io/cluster-api/internal/controllers/cluster" machinecontroller "sigs.k8s.io/cluster-api/internal/controllers/machine" @@ -64,48 +67,55 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+ &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err)) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, - }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { - panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } + // Setting ConnectionCreationRetryInterval to 2 seconds, otherwise client creation is + // only retried every 30s. If we get unlucky tests are then failing with timeout. + clusterCache.(interface{ SetConnectionCreationRetryInterval(time.Duration) }). + SetConnectionCreationRetryInterval(2 * time.Second) + if err := (&clustercontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetClient(), + Client: mgr.GetClient(), + APIReader: mgr.GetClient(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start ClusterReconciler: %v", err)) } if err := (&Reconciler{ - Client: mgr.GetClient(), - Tracker: tracker, + Client: mgr.GetClient(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start Reconciler : %v", err)) } if err := (&machinecontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineReconciler: %v", err)) } if err := (&machinesetcontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MMachineSetReconciler: %v", err)) } diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 51da8e19440c..905dcb66849e 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -45,9 +45,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" "sigs.k8s.io/cluster-api/controllers/noderefutil" - "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/controllers/machine" "sigs.k8s.io/cluster-api/internal/util/ssa" @@ -87,9 +87,9 @@ const machineSetManagerName = "capi-machineset" // Reconciler reconciles a MachineSet object. type Reconciler struct { - Client client.Client - APIReader client.Reader - Tracker *remote.ClusterCacheTracker + Client client.Client + APIReader client.Reader + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -128,7 +128,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ), ), - ).Complete(r) + ). + WatchesRawSource(r.ClusterCache.GetClusterSource("machineset", clusterToMachineSets)). + Complete(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } @@ -196,10 +198,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re result, err := r.reconcile(ctx, cluster, machineSet) if err != nil { - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err) @@ -1047,7 +1048,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste } func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (*corev1.Node, error) { - remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { return nil, err } diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index 449d09a0e954..9c1f901ac7f8 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -61,6 +61,11 @@ func TestMachineSetReconciler(t *testing.T) { t.Log("Creating the Cluster Kubeconfig Secret") g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed()) + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessor. 
+ patch := client.MergeFrom(cluster.DeepCopy()) + cluster.Status.InfrastructureReady = true + g.Expect(env.Status().Patch(ctx, cluster, patch)).To(Succeed()) + return ns, cluster } diff --git a/internal/controllers/machineset/suite_test.go b/internal/controllers/machineset/suite_test.go index f42aa1737b58..59c592951382 100644 --- a/internal/controllers/machineset/suite_test.go +++ b/internal/controllers/machineset/suite_test.go @@ -38,6 +38,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" machinecontroller "sigs.k8s.io/cluster-api/internal/controllers/machine" "sigs.k8s.io/cluster-api/internal/test/envtest" @@ -68,36 +69,37 @@ func TestMain(m *testing.M) { } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - ) + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, + }, + }, controller.Options{MaxConcurrentReconciles: 10}) if err != nil { - panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err)) - } - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, - }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { - panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err)) + panic(fmt.Sprintf("Failed to create ClusterCache: %v", err)) } if err := (&Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MMachineSetReconciler: %v", err)) } if err := (&machinecontroller.Reconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - Tracker: tracker, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil { panic(fmt.Sprintf("Failed to start MachineReconciler: %v", err)) } diff --git a/internal/controllers/topology/cluster/cluster_controller.go b/internal/controllers/topology/cluster/cluster_controller.go index aeb86f152405..1654a55622c4 100644 --- a/internal/controllers/topology/cluster/cluster_controller.go +++ b/internal/controllers/topology/cluster/cluster_controller.go @@ -35,8 +35,8 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" runtimecatalog 
"sigs.k8s.io/cluster-api/exp/runtime/catalog" runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1" @@ -66,8 +66,8 @@ import ( // Reconciler reconciles a managed topology for a Cluster object. type Reconciler struct { - Client client.Client - Tracker *remote.ClusterCacheTracker + Client client.Client + ClusterCache clustercache.ClusterCache // APIReader is used to list MachineSets directly via the API server to avoid // race conditions caused by an outdated cache. APIReader client.Reader @@ -94,6 +94,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), )). Named("topology/cluster"). + WatchesRawSource(r.ClusterCache.GetClusterSource("topology/cluster", func(_ context.Context, o client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}} + })). Watches( &clusterv1.ClusterClass{}, handler.EnqueueRequestsFromMapFunc(r.clusterClassToCluster), @@ -123,7 +126,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Cache: mgr.GetCache(), Scheme: mgr.GetScheme(), } - r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.Tracker, r.RuntimeClient) + r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.ClusterCache, r.RuntimeClient) r.recorder = mgr.GetEventRecorderFor("topology/cluster-controller") if r.patchHelperFactory == nil { r.patchHelperFactory = serverSideApplyPatchHelperFactory(r.Client, ssa.NewCache()) @@ -133,7 +136,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt // SetupForDryRun prepares the Reconciler for a dry run execution. func (r *Reconciler) SetupForDryRun(recorder record.EventRecorder) { - r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.Tracker, r.RuntimeClient) + r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.ClusterCache, r.RuntimeClient) r.recorder = recorder r.patchHelperFactory = dryRunPatchHelperFactory(r.Client) } @@ -203,10 +206,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re // Handle normal reconciliation loop. result, err := r.reconcile(ctx, s) if err != nil { - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + // Requeue if the reconcile failed because connection to workload cluster was down. + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } } diff --git a/internal/controllers/topology/cluster/cluster_controller_test.go b/internal/controllers/topology/cluster/cluster_controller_test.go index 209421179cb5..9e8d95bbf784 100644 --- a/internal/controllers/topology/cluster/cluster_controller_test.go +++ b/internal/controllers/topology/cluster/cluster_controller_test.go @@ -894,6 +894,18 @@ func setupTestEnvForIntegrationTests(ns *corev1.Namespace) (func() error, error) return cleanup, err } } + // Set InfrastructureReady to true so ClusterCache creates the clusterAccessors. 
+ patch := client.MergeFrom(cluster1.DeepCopy()) + cluster1.Status.InfrastructureReady = true + if err := env.Status().Patch(ctx, cluster1, patch); err != nil { + return nil, err + } + patch = client.MergeFrom(cluster2.DeepCopy()) + cluster2.Status.InfrastructureReady = true + if err := env.Status().Patch(ctx, cluster2, patch); err != nil { + return nil, err + } + return cleanup, nil } @@ -1041,7 +1053,7 @@ func assertMachineDeploymentsReconcile(cluster *clusterv1.Cluster) error { // Check replicas and version for the MachineDeployment. if *md.Spec.Replicas != *topologyMD.Replicas { - return fmt.Errorf("replicas %v does not match expected %v", md.Spec.Replicas, topologyMD.Replicas) + return fmt.Errorf("replicas %v does not match expected %v", *md.Spec.Replicas, *topologyMD.Replicas) } if *md.Spec.Template.Spec.Version != cluster.Spec.Topology.Version { return fmt.Errorf("version %v does not match expected %v", *md.Spec.Template.Spec.Version, cluster.Spec.Topology.Version) diff --git a/internal/controllers/topology/cluster/suite_test.go b/internal/controllers/topology/cluster/suite_test.go index 080980413274..41f997b73c04 100644 --- a/internal/controllers/topology/cluster/suite_test.go +++ b/internal/controllers/topology/cluster/suite_test.go @@ -34,6 +34,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/internal/controllers/clusterclass" @@ -65,37 +66,30 @@ func TestMain(m *testing.M) { } } setupReconcilers := func(ctx context.Context, mgr ctrl.Manager) { - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - secretCachingClient, err := client.New(mgr.GetConfig(), client.Options{ - HTTPClient: mgr.GetHTTPClient(), - Cache: &client.CacheOptions{ - Reader: mgr.GetCache(), + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: mgr.GetClient(), + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, }, - }) - if err != nil { - panic(fmt.Sprintf("unable to create secretCachingClient: %v", err)) - } - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - Log: &ctrl.Log, - SecretCachingClient: secretCachingClient, + Client: clustercache.ClientOptions{ + UserAgent: remote.DefaultClusterAPIUserAgent("test-controller-manager"), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. 
+						&corev1.ConfigMap{},
+						&corev1.Secret{},
+					},
+				},
 			},
-	)
+	}, controller.Options{MaxConcurrentReconciles: 10})
 	if err != nil {
-		panic(fmt.Sprintf("unable to create cluster cache tracker: %v", err))
-	}
-	if err := (&remote.ClusterCacheReconciler{
-		Client:  mgr.GetClient(),
-		Tracker: tracker,
-	}).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil {
-		panic(fmt.Sprintf("Failed to start ClusterCacheReconciler: %v", err))
+		panic(fmt.Sprintf("Failed to create ClusterCache: %v", err))
 	}
+
 	if err := (&Reconciler{
-		Client:    mgr.GetClient(),
-		APIReader: mgr.GetAPIReader(),
-		Tracker:   tracker,
+		Client:       mgr.GetClient(),
+		APIReader:    mgr.GetAPIReader(),
+		ClusterCache: clusterCache,
 	}).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 1}); err != nil {
 		panic(fmt.Sprintf("unable to create topology cluster reconciler: %v", err))
 	}
diff --git a/internal/test/envtest/environment.go b/internal/test/envtest/environment.go
index 1c6dd9dd0252..080659d9ffc4 100644
--- a/internal/test/envtest/environment.go
+++ b/internal/test/envtest/environment.go
@@ -78,7 +78,17 @@ func init() {
 	// Otherwise it would fall back and log to os.Stderr.
 	// This would lead to race conditions because input.M.Run() writes os.Stderr
 	// while some go routines in controller-runtime use os.Stderr to write logs.
-	if err := logsv1.ValidateAndApply(logs.NewOptions(), nil); err != nil {
+	logOptions := logs.NewOptions()
+	logOptions.Verbosity = logsv1.VerbosityLevel(2)
+	if logLevel := os.Getenv("CAPI_TEST_ENV_LOG_LEVEL"); logLevel != "" {
+		logLevelInt, err := strconv.Atoi(logLevel)
+		if err != nil {
+			klog.ErrorS(err, "Unable to convert value of CAPI_TEST_ENV_LOG_LEVEL environment variable to integer")
+			os.Exit(1)
+		}
+		logOptions.Verbosity = logsv1.VerbosityLevel(logLevelInt)
+	}
+	if err := logsv1.ValidateAndApply(logOptions, nil); err != nil {
 		klog.ErrorS(err, "Unable to validate and apply log options")
 		os.Exit(1)
 	}
diff --git a/internal/webhooks/cluster.go b/internal/webhooks/cluster.go
index 255f820b291d..de51e7ab9d65 100644
--- a/internal/webhooks/cluster.go
+++ b/internal/webhooks/cluster.go
@@ -65,15 +65,15 @@ func (webhook *Cluster) SetupWebhookWithManager(mgr ctrl.Manager) error {
 // +kubebuilder:webhook:verbs=create;update;delete,path=/validate-cluster-x-k8s-io-v1beta1-cluster,mutating=false,failurePolicy=fail,matchPolicy=Equivalent,groups=cluster.x-k8s.io,resources=clusters,versions=v1beta1,name=validation.cluster.cluster.x-k8s.io,sideEffects=None,admissionReviewVersions=v1;v1beta1
 // +kubebuilder:webhook:verbs=create;update,path=/mutate-cluster-x-k8s-io-v1beta1-cluster,mutating=true,failurePolicy=fail,matchPolicy=Equivalent,groups=cluster.x-k8s.io,resources=clusters,versions=v1beta1,name=default.cluster.cluster.x-k8s.io,sideEffects=None,admissionReviewVersions=v1;v1beta1

-// ClusterCacheTrackerReader is a scoped-down interface from ClusterCacheTracker that only allows to get a reader client.
-type ClusterCacheTrackerReader interface {
+// ClusterCacheReader is a scoped-down interface from ClusterCache that only allows getting a reader client.
+type ClusterCacheReader interface {
 	GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error)
 }

 // Cluster implements a validating and defaulting webhook for Cluster.
type Cluster struct { - Client client.Reader - Tracker ClusterCacheTrackerReader + Client client.Reader + ClusterCacheReader ClusterCacheReader decoder admission.Decoder } @@ -493,7 +493,7 @@ func (webhook *Cluster) validateTopologyVersion(ctx context.Context, fldPath *fi } // minor version cannot be increased if MachinePools are upgrading or not yet on the current version - if err := validateTopologyMachinePoolVersions(ctx, webhook.Client, webhook.Tracker, oldCluster, oldVersion); err != nil { + if err := validateTopologyMachinePoolVersions(ctx, webhook.Client, webhook.ClusterCacheReader, oldCluster, oldVersion); err != nil { allErrs = append(allErrs, fmt.Errorf("blocking version update due to MachinePool version check: %v", err)) } @@ -600,7 +600,7 @@ func validateTopologyMachineDeploymentVersions(ctx context.Context, ctrlClient c return nil } -func validateTopologyMachinePoolVersions(ctx context.Context, ctrlClient client.Reader, tracker ClusterCacheTrackerReader, oldCluster *clusterv1.Cluster, oldVersion semver.Version) error { +func validateTopologyMachinePoolVersions(ctx context.Context, ctrlClient client.Reader, clusterCacheReader ClusterCacheReader, oldCluster *clusterv1.Cluster, oldVersion semver.Version) error { // List all the machine pools in the current cluster and in a managed topology. // FROM: current_state.go getCurrentMachinePoolState mps := &expv1.MachinePoolList{} @@ -620,7 +620,7 @@ func validateTopologyMachinePoolVersions(ctx context.Context, ctrlClient client. return nil } - wlClient, err := tracker.GetReader(ctx, client.ObjectKeyFromObject(oldCluster)) + wlClient, err := clusterCacheReader.GetReader(ctx, client.ObjectKeyFromObject(oldCluster)) if err != nil { return errors.Wrap(err, "unable to get client for workload cluster") } diff --git a/internal/webhooks/cluster_test.go b/internal/webhooks/cluster_test.go index 46451ac7a82e..dccee66d87ab 100644 --- a/internal/webhooks/cluster_test.go +++ b/internal/webhooks/cluster_test.go @@ -2131,11 +2131,11 @@ func TestClusterTopologyValidation(t *testing.T) { WithScheme(fakeScheme). Build() - // Use an empty fakeClusterCacheTracker here because the real cases are tested in Test_validateTopologyMachinePoolVersions. - fakeClusterCacheTrackerReader := &fakeClusterCacheTracker{client: fake.NewFakeClient()} + // Use an empty fakeClusterCache here because the real cases are tested in Test_validateTopologyMachinePoolVersions. + fakeClusterCacheReader := &fakeClusterCache{client: fake.NewFakeClient()} // Create the webhook and add the fakeClient as its client. This is required because the test uses a Managed Topology. - webhook := &Cluster{Client: fakeClient, Tracker: fakeClusterCacheTrackerReader} + webhook := &Cluster{Client: fakeClient, ClusterCacheReader: fakeClusterCacheReader} warnings, err := webhook.validate(ctx, tt.old, tt.in) if tt.expectErr { @@ -3628,7 +3628,7 @@ func Test_validateTopologyMachinePoolVersions(t *testing.T) { oldVersion, err := semver.ParseTolerant(tt.old.Spec.Topology.Version) g.Expect(err).ToNot(HaveOccurred()) - fakeClusterCacheTracker := &fakeClusterCacheTracker{ + fakeClusterCacheTracker := &fakeClusterCache{ client: fake.NewClientBuilder(). WithObjects(tt.workloadObjects...). 
Build(), @@ -3795,10 +3795,10 @@ func refToUnstructured(ref *corev1.ObjectReference) *unstructured.Unstructured { return output } -type fakeClusterCacheTracker struct { +type fakeClusterCache struct { client client.Reader } -func (f *fakeClusterCacheTracker) GetReader(_ context.Context, _ types.NamespacedName) (client.Reader, error) { +func (f *fakeClusterCache) GetReader(_ context.Context, _ types.NamespacedName) (client.Reader, error) { return f.client, nil } diff --git a/main.go b/main.go index bf90217daedf..8347a7b8f659 100644 --- a/main.go +++ b/main.go @@ -49,6 +49,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/api/v1beta1/index" "sigs.k8s.io/cluster-api/controllers" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" addonsv1 "sigs.k8s.io/cluster-api/exp/addons/api/v1beta1" addonscontrollers "sigs.k8s.io/cluster-api/exp/addons/controllers" @@ -85,29 +86,30 @@ var ( controllerName = "cluster-api-controller-manager" // flags. - enableLeaderElection bool - leaderElectionLeaseDuration time.Duration - leaderElectionRenewDeadline time.Duration - leaderElectionRetryPeriod time.Duration - watchFilterValue string - watchNamespace string - profilerAddress string - enableContentionProfiling bool - syncPeriod time.Duration - restConfigQPS float32 - restConfigBurst int - clusterCacheTrackerClientQPS float32 - clusterCacheTrackerClientBurst int - webhookPort int - webhookCertDir string - webhookCertName string - webhookKeyName string - healthAddr string - managerOptions = flags.ManagerOptions{} - logOptions = logs.NewOptions() + enableLeaderElection bool + leaderElectionLeaseDuration time.Duration + leaderElectionRenewDeadline time.Duration + leaderElectionRetryPeriod time.Duration + watchFilterValue string + watchNamespace string + profilerAddress string + enableContentionProfiling bool + syncPeriod time.Duration + restConfigQPS float32 + restConfigBurst int + clusterCacheClientQPS float32 + clusterCacheClientBurst int + webhookPort int + webhookCertDir string + webhookCertName string + webhookKeyName string + healthAddr string + managerOptions = flags.ManagerOptions{} + logOptions = logs.NewOptions() // core Cluster API specific flags. 
+ remoteConnectionGracePeriod time.Duration clusterTopologyConcurrency int - clusterCacheTrackerConcurrency int + clusterCacheConcurrency int clusterClassConcurrency int clusterConcurrency int extensionConfigConcurrency int @@ -172,6 +174,10 @@ func InitFlags(fs *pflag.FlagSet) { fs.BoolVar(&enableContentionProfiling, "contention-profiling", false, "Enable block profiling") + fs.DurationVar(&remoteConnectionGracePeriod, "remote-connection-grace-period", 50*time.Second, + "Grace period after which the RemoteConnectionProbe condition on a Cluster goes to false, "+ + "if connection failures to a workload cluster occur") + fs.IntVar(&clusterTopologyConcurrency, "clustertopology-concurrency", 10, "Number of clusters to process simultaneously") @@ -181,7 +187,7 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&clusterConcurrency, "cluster-concurrency", 10, "Number of clusters to process simultaneously") - fs.IntVar(&clusterCacheTrackerConcurrency, "clustercachetracker-concurrency", 10, + fs.IntVar(&clusterCacheConcurrency, "clustercache-concurrency", 100, "Number of clusters to process simultaneously") fs.IntVar(&extensionConfigConcurrency, "extensionconfig-concurrency", 10, @@ -214,11 +220,11 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&restConfigBurst, "kube-api-burst", 30, "Maximum number of queries that should be allowed in one burst from the controller client to the Kubernetes API server.") - fs.Float32Var(&clusterCacheTrackerClientQPS, "clustercachetracker-client-qps", 20, - "Maximum queries per second from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.Float32Var(&clusterCacheClientQPS, "clustercache-client-qps", 20, + "Maximum queries per second from the cluster cache clients to the Kubernetes API server of workload clusters.") - fs.IntVar(&clusterCacheTrackerClientBurst, "clustercachetracker-client-burst", 30, - "Maximum number of queries that should be allowed in one burst from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.IntVar(&clusterCacheClientBurst, "clustercache-client-burst", 30, + "Maximum number of queries that should be allowed in one burst from the cluster cache clients to the Kubernetes API server of workload clusters.") fs.IntVar(&webhookPort, "webhook-port", 9443, "Webhook Server port") @@ -358,8 +364,8 @@ func main() { setupChecks(mgr) setupIndexes(ctx, mgr) - tracker := setupReconcilers(ctx, mgr, watchNamespaces, &syncPeriod) - setupWebhooks(mgr, tracker) + clusterCache := setupReconcilers(ctx, mgr, watchNamespaces, &syncPeriod) + setupWebhooks(mgr, clusterCache) setupLog.Info("Starting manager", "version", version.Get().String()) if err := mgr.Start(ctx); err != nil { @@ -387,7 +393,7 @@ func setupIndexes(ctx context.Context, mgr ctrl.Manager) { } } -func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map[string]cache.Config, syncPeriod *time.Duration) webhooks.ClusterCacheTrackerReader { +func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map[string]cache.Config, syncPeriod *time.Duration) clustercache.ClusterCache { secretCachingClient, err := client.New(mgr.GetConfig(), client.Options{ HTTPClient: mgr.GetHTTPClient(), Cache: &client.CacheOptions{ @@ -399,38 +405,30 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map os.Exit(1) } - // Set up a ClusterCacheTracker and ClusterCacheReconciler to provide to controllers - // requiring a connection to a remote cluster - tracker, err := 
remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - SecretCachingClient: secretCachingClient, - ControllerName: controllerName, - Log: &ctrl.Log, - ClientUncachedObjects: []client.Object{ - // Don't cache ConfigMaps & Secrets. - &corev1.ConfigMap{}, - &corev1.Secret{}, - // Don't cache Pods & DaemonSets (we get/list them e.g. during drain). - &corev1.Pod{}, - &appsv1.DaemonSet{}, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: secretCachingClient, + Cache: clustercache.CacheOptions{ + Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex}, + }, + Client: clustercache.ClientOptions{ + QPS: clusterCacheClientQPS, + Burst: clusterCacheClientBurst, + UserAgent: remote.DefaultClusterAPIUserAgent(controllerName), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + // Don't cache Pods & DaemonSets (we get/list them e.g. during drain). + &corev1.Pod{}, + &appsv1.DaemonSet{}, + }, }, - Indexes: []remote.Index{remote.NodeProviderIDIndex}, - ClientQPS: clusterCacheTrackerClientQPS, - ClientBurst: clusterCacheTrackerClientBurst, }, - ) - if err != nil { - setupLog.Error(err, "Unable to create cluster cache tracker") - os.Exit(1) - } - - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, WatchFilterValue: watchFilterValue, - }).SetupWithManager(ctx, mgr, concurrency(clusterCacheTrackerConcurrency)); err != nil { - setupLog.Error(err, "Unable to create controller", "controller", "ClusterCacheReconciler") + }, concurrency(clusterCacheConcurrency)) + if err != nil { + setupLog.Error(err, "Unable to create ClusterCache") os.Exit(1) } @@ -488,7 +486,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map Client: mgr.GetClient(), APIReader: mgr.GetAPIReader(), RuntimeClient: runtimeClient, - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, concurrency(clusterTopologyConcurrency)); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "ClusterTopology") @@ -527,9 +525,11 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map } if err := (&controllers.ClusterReconciler{ - Client: mgr.GetClient(), - APIReader: mgr.GetAPIReader(), - WatchFilterValue: watchFilterValue, + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + ClusterCache: clusterCache, + WatchFilterValue: watchFilterValue, + RemoteConnectionGracePeriod: remoteConnectionGracePeriod, }).SetupWithManager(ctx, mgr, concurrency(clusterConcurrency)); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "Cluster") os.Exit(1) @@ -537,7 +537,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map if err := (&controllers.MachineReconciler{ Client: mgr.GetClient(), APIReader: mgr.GetAPIReader(), - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, concurrency(machineConcurrency)); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "Machine") @@ -546,7 +546,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map if err := (&controllers.MachineSetReconciler{ Client: mgr.GetClient(), APIReader: mgr.GetAPIReader(), - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, 
DeprecatedInfraMachineNaming: useDeprecatedInfraMachineNaming, }).SetupWithManager(ctx, mgr, concurrency(machineSetConcurrency)); err != nil { @@ -566,7 +566,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map if err := (&expcontrollers.MachinePoolReconciler{ Client: mgr.GetClient(), APIReader: mgr.GetAPIReader(), - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, concurrency(machinePoolConcurrency)); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "MachinePool") @@ -577,7 +577,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map if feature.Gates.Enabled(feature.ClusterResourceSet) { if err := (&addonscontrollers.ClusterResourceSetReconciler{ Client: mgr.GetClient(), - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, concurrency(clusterResourceSetConcurrency), partialSecretCache); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "ClusterResourceSet") @@ -594,17 +594,17 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager, watchNamespaces map if err := (&controllers.MachineHealthCheckReconciler{ Client: mgr.GetClient(), - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, concurrency(machineHealthCheckConcurrency)); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "MachineHealthCheck") os.Exit(1) } - return tracker + return clusterCache } -func setupWebhooks(mgr ctrl.Manager, tracker webhooks.ClusterCacheTrackerReader) { +func setupWebhooks(mgr ctrl.Manager, clusterCacheReader webhooks.ClusterCacheReader) { // NOTE: ClusterClass and managed topologies are behind ClusterTopology feature gate flag; the webhook // is going to prevent creating or updating new objects in case the feature flag is disabled. if err := (&webhooks.ClusterClass{Client: mgr.GetClient()}).SetupWebhookWithManager(mgr); err != nil { @@ -614,7 +614,7 @@ func setupWebhooks(mgr ctrl.Manager, tracker webhooks.ClusterCacheTrackerReader) // NOTE: ClusterClass and managed topologies are behind ClusterTopology feature gate flag; the webhook // is going to prevent usage of Cluster.Topology in case the feature flag is disabled. 
- if err := (&webhooks.Cluster{Client: mgr.GetClient(), ClusterCacheTrackerReader: tracker}).SetupWebhookWithManager(mgr); err != nil { + if err := (&webhooks.Cluster{Client: mgr.GetClient(), ClusterCacheReader: clusterCacheReader}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "Unable to create webhook", "webhook", "Cluster") os.Exit(1) } diff --git a/test/infrastructure/docker/controllers/alias.go b/test/infrastructure/docker/controllers/alias.go index ca4ecf6e0ab8..1bd5da4be727 100644 --- a/test/infrastructure/docker/controllers/alias.go +++ b/test/infrastructure/docker/controllers/alias.go @@ -24,7 +24,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/test/infrastructure/container" dockercontrollers "sigs.k8s.io/cluster-api/test/infrastructure/docker/internal/controllers" ) @@ -36,7 +36,7 @@ import ( type DockerMachineReconciler struct { Client client.Client ContainerRuntime container.Runtime - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -47,7 +47,7 @@ func (r *DockerMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl return (&dockercontrollers.DockerMachineReconciler{ Client: r.Client, ContainerRuntime: r.ContainerRuntime, - Tracker: r.Tracker, + ClusterCache: r.ClusterCache, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } diff --git a/test/infrastructure/docker/exp/controllers/alias.go b/test/infrastructure/docker/exp/controllers/alias.go index 718b001c6201..a7778328d0e9 100644 --- a/test/infrastructure/docker/exp/controllers/alias.go +++ b/test/infrastructure/docker/exp/controllers/alias.go @@ -25,7 +25,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/test/infrastructure/container" dockermachinepoolcontrollers "sigs.k8s.io/cluster-api/test/infrastructure/docker/exp/internal/controllers" ) @@ -35,7 +34,6 @@ type DockerMachinePoolReconciler struct { Client client.Client Scheme *runtime.Scheme ContainerRuntime container.Runtime - Tracker *remote.ClusterCacheTracker // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -47,7 +45,6 @@ func (r *DockerMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr Client: r.Client, Scheme: r.Scheme, ContainerRuntime: r.ContainerRuntime, - Tracker: r.Tracker, WatchFilterValue: r.WatchFilterValue, }).SetupWithManager(ctx, mgr, options) } diff --git a/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller.go b/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller.go index 8306ce097ad0..6cdcf5a22390 100644 --- a/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller.go +++ b/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller.go @@ -21,7 +21,6 @@ import ( "context" "fmt" "sort" - "time" "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -41,7 +40,6 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/external" - "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" utilexp "sigs.k8s.io/cluster-api/exp/util" "sigs.k8s.io/cluster-api/internal/util/ssa" @@ -67,7 +65,6 @@ type DockerMachinePoolReconciler struct { Client client.Client Scheme *runtime.Scheme ContainerRuntime container.Runtime - Tracker *remote.ClusterCacheTracker // WatchFilterValue is the label value used to filter events prior to reconciliation. WatchFilterValue string @@ -155,14 +152,7 @@ func (r *DockerMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Re } // Handle non-deleted machines - res, err = r.reconcileNormal(ctx, cluster, machinePool, dockerMachinePool) - // Requeue if the reconcile failed because the ClusterCacheTracker was locked for - // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") - return ctrl.Result{RequeueAfter: time.Minute}, nil - } - return res, err + return r.reconcileNormal(ctx, cluster, machinePool, dockerMachinePool) } // SetupWithManager will add watches for this controller. diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go index 5e72a160afad..fd4a2ba25825 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go @@ -38,7 +38,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" - "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/controllers/clustercache" utilexp "sigs.k8s.io/cluster-api/exp/util" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta1" @@ -57,7 +57,7 @@ import ( type DockerMachineReconciler struct { client.Client ContainerRuntime container.Runtime - Tracker *remote.ClusterCacheTracker + ClusterCache clustercache.ClusterCache // WatchFilterValue is the label value used to filter events prior to reconciliation. 
WatchFilterValue string @@ -193,8 +193,8 @@ func (r *DockerMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reques res, err := r.reconcileNormal(ctx, cluster, dockerCluster, machine, dockerMachine, externalMachine, externalLoadBalancer) // Requeue if the reconcile failed because the ClusterCacheTracker was locked for // the current cluster because of concurrent access. - if errors.Is(err, remote.ErrClusterLocked) { - log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") + if errors.Is(err, clustercache.ErrClusterNotConnected) { + log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } return res, err @@ -421,7 +421,7 @@ func (r *DockerMachineReconciler) reconcileNormal(ctx context.Context, cluster * // Usually a cloud provider will do this, but there is no docker-cloud provider. // Requeue if there is an error, as this is likely momentary load balancer // state changes during control plane provisioning. - remoteClient, err := r.Tracker.GetClient(ctx, client.ObjectKeyFromObject(cluster)) + remoteClient, err := r.ClusterCache.GetClient(ctx, client.ObjectKeyFromObject(cluster)) if err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to generate workload cluster client") } @@ -502,7 +502,9 @@ func (r *DockerMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl builder.WithPredicates( predicates.ClusterUnpausedAndInfrastructureReady(mgr.GetScheme(), predicateLog), ), - ).Complete(r) + ). + WatchesRawSource(r.ClusterCache.GetClusterSource("dockermachine", clusterToDockerMachines)). + Complete(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } diff --git a/test/infrastructure/docker/main.go b/test/infrastructure/docker/main.go index e1e774952c5c..8ab8366569be 100644 --- a/test/infrastructure/docker/main.go +++ b/test/infrastructure/docker/main.go @@ -45,6 +45,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/remote" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/feature" @@ -69,29 +70,29 @@ var ( controllerName = "cluster-api-docker-controller-manager" // flags. 
- enableLeaderElection bool - leaderElectionLeaseDuration time.Duration - leaderElectionRenewDeadline time.Duration - leaderElectionRetryPeriod time.Duration - watchFilterValue string - watchNamespace string - profilerAddress string - enableContentionProfiling bool - syncPeriod time.Duration - restConfigQPS float32 - restConfigBurst int - clusterCacheTrackerClientQPS float32 - clusterCacheTrackerClientBurst int - webhookPort int - webhookCertDir string - webhookCertName string - webhookKeyName string - healthAddr string - managerOptions = flags.ManagerOptions{} - logOptions = logs.NewOptions() + enableLeaderElection bool + leaderElectionLeaseDuration time.Duration + leaderElectionRenewDeadline time.Duration + leaderElectionRetryPeriod time.Duration + watchFilterValue string + watchNamespace string + profilerAddress string + enableContentionProfiling bool + syncPeriod time.Duration + restConfigQPS float32 + restConfigBurst int + clusterCacheClientQPS float32 + clusterCacheClientBurst int + webhookPort int + webhookCertDir string + webhookCertName string + webhookKeyName string + healthAddr string + managerOptions = flags.ManagerOptions{} + logOptions = logs.NewOptions() // CAPD specific flags. - concurrency int - clusterCacheTrackerConcurrency int + concurrency int + clusterCacheConcurrency int ) func init() { @@ -137,7 +138,7 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&concurrency, "concurrency", 10, "The number of docker machines to process simultaneously") - fs.IntVar(&clusterCacheTrackerConcurrency, "clustercachetracker-concurrency", 10, + fs.IntVar(&clusterCacheConcurrency, "clustercache-concurrency", 100, "Number of clusters to process simultaneously") fs.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, @@ -149,11 +150,11 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&restConfigBurst, "kube-api-burst", 30, "Maximum number of queries that should be allowed in one burst from the controller client to the Kubernetes API server.") - fs.Float32Var(&clusterCacheTrackerClientQPS, "clustercachetracker-client-qps", 20, - "Maximum queries per second from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.Float32Var(&clusterCacheClientQPS, "clustercache-client-qps", 20, + "Maximum queries per second from the cluster cache clients to the Kubernetes API server of workload clusters.") - fs.IntVar(&clusterCacheTrackerClientBurst, "clustercachetracker-client-burst", 30, - "Maximum number of queries that should be allowed in one burst from the cluster cache tracker clients to the Kubernetes API server of workload clusters.") + fs.IntVar(&clusterCacheClientBurst, "clustercache-client-burst", 30, + "Maximum number of queries that should be allowed in one burst from the cluster cache clients to the Kubernetes API server of workload clusters.") fs.IntVar(&webhookPort, "webhook-port", 9443, "Webhook Server port") @@ -323,36 +324,34 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { os.Exit(1) } - tracker, err := remote.NewClusterCacheTracker( - mgr, - remote.ClusterCacheTrackerOptions{ - SecretCachingClient: secretCachingClient, - ControllerName: controllerName, - Log: &ctrl.Log, - ClientQPS: clusterCacheTrackerClientQPS, - ClientBurst: clusterCacheTrackerClientBurst, + clusterCache, err := clustercache.SetupWithManager(ctx, mgr, clustercache.Options{ + SecretClient: secretCachingClient, + Cache: clustercache.CacheOptions{}, + Client: clustercache.ClientOptions{ + QPS: clusterCacheClientQPS, + Burst: clusterCacheClientBurst, + 
UserAgent: remote.DefaultClusterAPIUserAgent(controllerName), + Cache: clustercache.ClientCacheOptions{ + DisableFor: []client.Object{ + // Don't cache ConfigMaps & Secrets. + &corev1.ConfigMap{}, + &corev1.Secret{}, + }, + }, }, - ) - if err != nil { - setupLog.Error(err, "Unable to create cluster cache tracker") - os.Exit(1) - } - - if err := (&remote.ClusterCacheReconciler{ - Client: mgr.GetClient(), - Tracker: tracker, WatchFilterValue: watchFilterValue, - }).SetupWithManager(ctx, mgr, controller.Options{ - MaxConcurrentReconciles: clusterCacheTrackerConcurrency, - }); err != nil { - setupLog.Error(err, "Unable to create controller", "controller", "ClusterCacheReconciler") + }, controller.Options{ + MaxConcurrentReconciles: clusterCacheConcurrency, + }) + if err != nil { + setupLog.Error(err, "Unable to create ClusterCache") os.Exit(1) } if err := (&controllers.DockerMachineReconciler{ Client: mgr.GetClient(), ContainerRuntime: runtimeClient, - Tracker: tracker, + ClusterCache: clusterCache, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, controller.Options{ MaxConcurrentReconciles: concurrency, @@ -374,7 +373,6 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) { if err := (&expcontrollers.DockerMachinePoolReconciler{ Client: mgr.GetClient(), ContainerRuntime: runtimeClient, - Tracker: tracker, WatchFilterValue: watchFilterValue, }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: concurrency}); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "DockerMachinePool") diff --git a/util/util.go b/util/util.go index 10f1a6e963de..b5a5b6b946b3 100644 --- a/util/util.go +++ b/util/util.go @@ -510,7 +510,10 @@ func ClusterToTypedObjectsMapper(c client.Client, ro client.ObjectList, scheme * listOpts = append(listOpts, client.InNamespace(cluster.Namespace)) } - objectList = objectList.DeepCopyObject().(client.ObjectList) + // Note: We have to DeepCopy objectList into a new variable. Otherwise + // we have a race condition between DeepCopyObject and client.List if this + // mapper func is called concurrently. + objectList := objectList.DeepCopyObject().(client.ObjectList) if err := c.List(ctx, objectList, listOpts...); err != nil { return nil } diff --git a/webhooks/alias.go b/webhooks/alias.go index 1b71158d62d7..4d3c3c999080 100644 --- a/webhooks/alias.go +++ b/webhooks/alias.go @@ -29,19 +29,19 @@ import ( // Cluster implements a validating and defaulting webhook for Cluster. type Cluster struct { - Client client.Reader - ClusterCacheTrackerReader ClusterCacheTrackerReader + Client client.Reader + ClusterCacheReader ClusterCacheReader } -// ClusterCacheTrackerReader is a read-only ClusterCacheTracker useful to gather information +// ClusterCacheReader is a read-only ClusterCacheReader useful to gather information // for MachinePool nodes for workload clusters. -type ClusterCacheTrackerReader = webhooks.ClusterCacheTrackerReader +type ClusterCacheReader = webhooks.ClusterCacheReader // SetupWebhookWithManager sets up Cluster webhooks. func (webhook *Cluster) SetupWebhookWithManager(mgr ctrl.Manager) error { return (&webhooks.Cluster{ - Client: webhook.Client, - Tracker: webhook.ClusterCacheTrackerReader, + Client: webhook.Client, + ClusterCacheReader: webhook.ClusterCacheReader, }).SetupWebhookWithManager(mgr) }
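Usage note (illustrative, not part of the patch above): the sketch below shows how a controller manager could wire up and consume the new clustercache package, based only on the API surface exercised in this diff (clustercache.SetupWithManager, clustercache.Options, GetClient, ErrClusterNotConnected). The "example" package, function names, and the concurrency value are assumptions made for the example, not code from this change.

// Example only: not part of the change above. A minimal sketch, assuming a hypothetical
// "example" provider package, of how the clustercache API used throughout this patch fits together.
package example

import (
	"context"
	"errors"
	"time"

	corev1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"

	"sigs.k8s.io/cluster-api/controllers/clustercache"
	"sigs.k8s.io/cluster-api/controllers/remote"
)

// setupClusterCache replaces the former ClusterCacheTracker + ClusterCacheReconciler pair
// with a single SetupWithManager call, mirroring the options used in main.go of this patch.
func setupClusterCache(ctx context.Context, mgr ctrl.Manager) (clustercache.ClusterCache, error) {
	return clustercache.SetupWithManager(ctx, mgr, clustercache.Options{
		// SecretClient is used to read the kubeconfig Secrets of workload clusters.
		SecretClient: mgr.GetClient(),
		Cache: clustercache.CacheOptions{
			Indexes: []clustercache.CacheOptionsIndex{clustercache.NodeProviderIDIndex},
		},
		Client: clustercache.ClientOptions{
			UserAgent: remote.DefaultClusterAPIUserAgent("example-controller-manager"),
			Cache: clustercache.ClientCacheOptions{
				// Don't cache ConfigMaps & Secrets in the workload cluster clients.
				DisableFor: []client.Object{&corev1.ConfigMap{}, &corev1.Secret{}},
			},
		},
	}, controller.Options{MaxConcurrentReconciles: 10})
}

// getWorkloadNodes shows the error-handling pattern the migrated controllers use:
// when the connection to the workload cluster is down, requeue instead of surfacing an error.
func getWorkloadNodes(ctx context.Context, cc clustercache.ClusterCache, clusterKey client.ObjectKey) (*corev1.NodeList, ctrl.Result, error) {
	remoteClient, err := cc.GetClient(ctx, clusterKey)
	if err != nil {
		if errors.Is(err, clustercache.ErrClusterNotConnected) {
			return nil, ctrl.Result{RequeueAfter: time.Minute}, nil
		}
		return nil, ctrl.Result{}, err
	}
	nodes := &corev1.NodeList{}
	if err := remoteClient.List(ctx, nodes); err != nil {
		return nil, ctrl.Result{}, err
	}
	return nodes, ctrl.Result{}, nil
}

Compared to the tracker-based setup this diff removes, a single SetupWithManager call owns the connection lifecycle: controllers no longer handle remote.ErrClusterLocked, they requeue on clustercache.ErrClusterNotConnected, and connection health is reported on the Cluster through the RemoteConnectionProbe condition governed by the new --remote-connection-grace-period flag.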