From de06dace2fb3de92c5655d99cd2e4614d3938cc0 Mon Sep 17 00:00:00 2001 From: sin3point14 Date: Wed, 11 Dec 2024 21:30:50 +0530 Subject: [PATCH] Don't require container to be down in dcc.Down The check can be racy, let the caller do whatever checks it wants to do. daemon.go already has a check when it decides if dcc.Down is needed or not Now we also add a check to RestartServiceWithHaltHeight before it calls Down, with an appropriate error message. --- internal/pkg/daemon/daemon.go | 4 ++++ internal/pkg/docker/compose.go | 27 ++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/internal/pkg/daemon/daemon.go b/internal/pkg/daemon/daemon.go index 04cf58b..8a1f563 100644 --- a/internal/pkg/daemon/daemon.go +++ b/internal/pkg/daemon/daemon.go @@ -479,6 +479,10 @@ func (d *Daemon) performUpgrade( return errors.Wrapf(err, "failed to check if service is running") } + // This check is prone to race conditions, the image could be up at this point + // but exits before dcc.Down is called. However, at this point we are certain + // that upgrade height has been hit, so, it should be safe to Down an exited + // container. if isRunning { logger.Info("Executing compose down").Notifyf(ctx, "Shutting down chain to perform upgrade. Current image: %s, new image: %s", currImage, newImage) if err = d.dcc.Down(ctx, serviceName, compose.DownTimeout); err != nil { diff --git a/internal/pkg/docker/compose.go b/internal/pkg/docker/compose.go index 8b16353..0938137 100644 --- a/internal/pkg/docker/compose.go +++ b/internal/pkg/docker/compose.go @@ -182,26 +182,18 @@ func (dcc *ComposeClient) upgradeImageInComposeFile(ctx context.Context, service } func (dcc *ComposeClient) Down(ctx context.Context, serviceName string, timeout time.Duration) error { - isImageContainerRunning, err := dcc.IsServiceRunning(ctx, serviceName, timeout) - if err != nil { - return errors.Wrapf(err, "check for container running failed") - } - if !isImageContainerRunning { - return errors.Wrapf(ErrContainerNotRunning, "expected the container to run before calling docker compose down") - } - // 5 seconds buffer to handle the case when docker timeout (-t) // takes slightly longer than defined timeout (thus delay context cancellation) deadline := timeout + 5*time.Second timeoutSeconds := int(math.Round(timeout.Seconds())) - err = cmd.ExecuteWithDeadlineAndLog(ctx, deadline, []string{}, "docker", "compose", "-f", dcc.composeFile, "down", "--remove-orphans", "-t", strconv.Itoa(timeoutSeconds)) + err := cmd.ExecuteWithDeadlineAndLog(ctx, deadline, []string{}, "docker", "compose", "-f", dcc.composeFile, "down", "--remove-orphans", "-t", strconv.Itoa(timeoutSeconds)) if err != nil { return errors.Wrapf(err, "docker compose down failed") } // verify from docker api that the container is down - isImageContainerRunning, err = dcc.IsServiceRunning(ctx, serviceName, timeout) + isImageContainerRunning, err := dcc.IsServiceRunning(ctx, serviceName, timeout) if err != nil { return errors.Wrapf(err, "check for container running failed") } @@ -243,7 +235,20 @@ func (dcc *ComposeClient) Up(ctx context.Context, serviceName string, timeout ti } func (dcc *ComposeClient) RestartServiceWithHaltHeight(ctx context.Context, composeConfig *config.ComposeCli, serviceName string, upgradeHeight int64) error { - err := dcc.Down(ctx, serviceName, composeConfig.DownTimeout) + isImageContainerRunning, err := dcc.IsServiceRunning(ctx, serviceName, composeConfig.DownTimeout) + if err != nil { + return errors.Wrapf(err, "check for container running failed") + } + if !isImageContainerRunning { + return errors.Wrapf(ErrContainerNotRunning, "expected the container to run before restarting with halt height") + } + // The check above is prone to race conditions and the + // container can exit after the check. That should be super rare + // and it should be safe to Down it anyways, since, we are sure that we want + // to register halt height and restart. + // If the container crashed due to some issue, let it crash again after the restart + // blazar can't do anything about that + err = dcc.Down(ctx, serviceName, composeConfig.DownTimeout) if err != nil && !errors.Is(err, ErrContainerNotRunning) { return errors.Wrapf(err, "docker compose down failed") }