From 4aad389b055b45b82aecf3e851a8d0ec3c33fe25 Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Mon, 9 Dec 2024 13:38:17 -0500 Subject: [PATCH] Fix flaky TestAgentRolloutController (#49886) * Fix falky TestAgentRolloutController * switch to real clock + increase Eventually timeout * Make reconciliation period a parameter + add TELEPORT_UNSTABLE env var * Update lib/service/service_test.go Co-authored-by: Alan Parra * Apply suggestions from code review Co-authored-by: Alan Parra * Remove env var * lint --------- Co-authored-by: Alan Parra --- lib/autoupdate/rollout/controller.go | 17 +++++++++++++---- lib/service/service.go | 2 +- lib/service/service_test.go | 15 +++++++-------- lib/service/servicecfg/auth.go | 7 +++++++ 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/lib/autoupdate/rollout/controller.go b/lib/autoupdate/rollout/controller.go index 667a2c557ca32..1c45a6ac4fc5b 100644 --- a/lib/autoupdate/rollout/controller.go +++ b/lib/autoupdate/rollout/controller.go @@ -31,7 +31,7 @@ import ( ) const ( - reconcilerPeriod = time.Minute + defaultReconcilerPeriod = time.Minute ) // Controller wakes up every minute to reconcile the autoupdate_agent_rollout resource. @@ -43,10 +43,14 @@ type Controller struct { reconciler reconciler clock clockwork.Clock log *slog.Logger + period time.Duration } // NewController creates a new Controller for the autoupdate_agent_rollout kind. -func NewController(client Client, log *slog.Logger, clock clockwork.Clock) (*Controller, error) { +// The period can be specified to control the sync frequency. This is mainly +// used to speed up tests or for demo purposes. When empty, the controller picks +// a sane default value. +func NewController(client Client, log *slog.Logger, clock clockwork.Clock, period time.Duration) (*Controller, error) { if client == nil { return nil, trace.BadParameter("missing client") } @@ -57,6 +61,10 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock) (*Con return nil, trace.BadParameter("missing clock") } + if period <= 0 { + period = defaultReconcilerPeriod + } + return &Controller{ clock: clock, log: log, @@ -67,14 +75,15 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock) (*Con // TODO(hugoShaka): add the strategies here as we implement them }, }, + period: period, }, nil } // Run the autoupdate_agent_rollout controller. This function returns only when its context is canceled. func (c *Controller) Run(ctx context.Context) error { config := interval.Config{ - Duration: reconcilerPeriod, - FirstDuration: reconcilerPeriod, + Duration: c.period, + FirstDuration: c.period, Jitter: retryutils.SeventhJitter, Clock: c.clock, } diff --git a/lib/service/service.go b/lib/service/service.go index 97fbd4a03e370..335b22dc17377 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -2435,7 +2435,7 @@ func (process *TeleportProcess) initAuthService() error { return trace.Wrap(spiffeFedSyncer.Run(process.GracefulExitContext()), "running SPIFFEFederation Syncer") }) - agentRolloutController, err := rollout.NewController(authServer, logger, process.Clock) + agentRolloutController, err := rollout.NewController(authServer, logger, process.Clock, cfg.Auth.AgentRolloutControllerSyncPeriod) if err != nil { return trace.Wrap(err, "creating the rollout controller") } diff --git a/lib/service/service_test.go b/lib/service/service_test.go index e0b421ff93780..e1ddfa148eb5d 100644 --- a/lib/service/service_test.go +++ b/lib/service/service_test.go @@ -1851,13 +1851,13 @@ func TestInitDatabaseService(t *testing.T) { func TestAgentRolloutController(t *testing.T) { t.Parallel() - // Test setup: create a Teleport Auth config - fakeClock := clockwork.NewFakeClock() - dataDir := makeTempDir(t) cfg := servicecfg.MakeDefaultConfig() - cfg.Clock = fakeClock + // We use a real clock because too many sevrices are using the clock and it's not possible to accurately wait for + // each one of them to reach the point where they wait for the clock to advance. If we add a WaitUntil(X waiters) + // check, this will break the next time we add a new waiter. + cfg.Clock = clockwork.NewRealClock() cfg.DataDir = dataDir cfg.SetAuthServerAddress(utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"}) cfg.Auth.Enabled = true @@ -1866,6 +1866,8 @@ func TestAgentRolloutController(t *testing.T) { cfg.DebugService.Enabled = false cfg.Auth.StorageConfig.Params["path"] = dataDir cfg.Auth.ListenAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"} + // Speed up the reconciliation period for testing purposes. + cfg.Auth.AgentRolloutControllerSyncPeriod = 200 * time.Millisecond cfg.CircuitBreakerConfig = breaker.NoopBreakerConfig() process, err := NewTeleport(cfg) @@ -1904,9 +1906,6 @@ func TestAgentRolloutController(t *testing.T) { version, err = authServer.CreateAutoUpdateVersion(ctx, version) require.NoError(t, err) - // Test execution: advance clock to trigger a reconciliation - fakeClock.Advance(2 * time.Minute) - // Test validation: check that a new autoupdate_agent_rollout config was created require.Eventually(t, func() bool { rollout, err := authServer.GetAutoUpdateAgentRollout(ctx) @@ -1914,7 +1913,7 @@ func TestAgentRolloutController(t *testing.T) { return false } return rollout.Spec.GetTargetVersion() == version.Spec.GetAgents().GetTargetVersion() - }, time.Second, 10*time.Millisecond) + }, 5*time.Second, 10*time.Millisecond) } // makeTempDir makes a temp dir with a shorter name than t.TempDir() in order to diff --git a/lib/service/servicecfg/auth.go b/lib/service/servicecfg/auth.go index 1b09042215aed..a8fcfcd57b8f2 100644 --- a/lib/service/servicecfg/auth.go +++ b/lib/service/servicecfg/auth.go @@ -20,6 +20,7 @@ package servicecfg import ( "slices" + "time" "github.com/dustin/go-humanize" "github.com/gravitational/trace" @@ -116,6 +117,12 @@ type AuthConfig struct { // AccessMonitoring configures access monitoring. AccessMonitoring *AccessMonitoringOptions + + // AgentRolloutControllerSyncPeriod controls the period between two + // reconciliations of the agent rollout controller. This value is jittered. + // Empty value means the controller uses its default. + // Used in tests. + AgentRolloutControllerSyncPeriod time.Duration } // AccessMonitoringOptions configures access monitoring.