Skip to content

Commit

Permalink
Merge pull request #316 from wenduwan/eni_retry
Browse files Browse the repository at this point in the history
amazon-ecs plugin: retry transient ECS task launch failures
  • Loading branch information
Stericson authored Jul 10, 2023
2 parents 3feee5d + a36612a commit 6a54954
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand All @@ -48,6 +49,7 @@
import com.amazonaws.waiters.WaiterUnrecoverableException;
import com.google.common.base.Throwables;

import com.google.common.collect.ImmutableList;
import org.apache.commons.lang.StringUtils;
import org.kohsuke.stapler.DataBoundConstructor;

Expand All @@ -67,6 +69,11 @@ public class ECSLauncher extends JNLPLauncher {
private final ECSCloud cloud;
private final ECSService ecsService;
private boolean launched;
// Total number of launch attempts (first try included) that launch() hands to
// launchECSTaskWithRetry.
private final int maxAttempts = 2;

// Fragments of WaiterUnrecoverableException messages that are treated as transient:
// when the waiter's message contains one of these, launchECSTask raises a
// RetryableLaunchFailure instead of failing the launch outright.
private static final List<String> FARGATE_RETRYABLE_MESSAGES = ImmutableList.of(
"Timeout waiting for network interface provisioning to complete"
);

@DataBoundConstructor
public ECSLauncher(ECSCloud cloud, String tunnel, String vmargs) {
Expand Down Expand Up @@ -108,7 +115,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
try {
long timeout = System.currentTimeMillis() + Duration.ofSeconds(cloud.getSlaveTimeoutInSeconds()).toMillis();

launchECSTask(ecsComputer, listener, timeout);
launchECSTaskWithRetry(ecsComputer, listener, timeout, maxAttempts);

// now wait for agent to be online
waitForAgent(agent, listener, timeout);
Expand Down Expand Up @@ -137,7 +144,21 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
}
}

protected Task launchECSTask(ECSComputer ecsComputer, TaskListener listener, long timeout) throws IOException, InterruptedException {
/**
 * Starts the ECS task, retrying when the launch fails with a {@link RetryableLaunchFailure}.
 *
 * <p>Every retryable failure is logged at WARNING level together with the attempt number.
 * Any other exception thrown by {@code launchECSTask} propagates immediately without a retry.
 *
 * @param ecsComputer the computer whose task should be started
 * @param listener    listener handed through to {@code launchECSTask}
 * @param timeout     value passed unchanged to every {@code launchECSTask} call
 * @param maxAttempts maximum number of launch attempts; values below 1 are treated as 1,
 *                    so the launch is always tried at least once
 * @return the task returned by the first successful {@code launchECSTask} call
 * @throws IllegalStateException if every attempt ended in a retryable failure; the last
 *                               failure is attached as the cause
 * @throws IOException           propagated from {@code launchECSTask}
 * @throws InterruptedException  propagated from {@code launchECSTask}
 */
protected Task launchECSTaskWithRetry(ECSComputer ecsComputer, TaskListener listener, long timeout, int maxAttempts) throws IOException, InterruptedException {
    // The launch has always been tried at least once regardless of maxAttempts; keep that
    // guarantee and make the reported attempt count match what actually happened.
    final int attemptsAllowed = Math.max(1, maxAttempts);
    RetryableLaunchFailure lastFailure = null;

    for (int attempt = 1; attempt <= attemptsAllowed; attempt++) {
        try {
            return launchECSTask(ecsComputer, listener, timeout);
        } catch (RetryableLaunchFailure e) {
            // Remember the failure so the final error can carry the root cause.
            lastFailure = e;
            LOGGER.log(Level.WARNING, "Attempt {0}: Failed to start task due to {1}", new Object[]{attempt, e});
        }
    }

    throw new IllegalStateException(
            MessageFormat.format("Failed to start task after {0} attempts", attemptsAllowed),
            lastFailure);
}

protected Task launchECSTask(ECSComputer ecsComputer, TaskListener listener, long timeout) throws IOException, InterruptedException, RetryableLaunchFailure {
PrintStream logger = listener.getLogger();

ECSSlave agent = ecsComputer.getNode();
Expand Down Expand Up @@ -169,6 +190,11 @@ protected Task launchECSTask(ECSComputer ecsComputer, TaskListener listener, lon
}
catch (WaiterUnrecoverableException exception){
LOGGER.log(Level.WARNING, MessageFormat.format("[{0}]: ECS Task stopped: {1}", agent.getNodeName(), startedTask.getTaskArn()), exception);

if (FARGATE_RETRYABLE_MESSAGES.stream().anyMatch(exception.getMessage()::contains)) {
throw new RetryableLaunchFailure(exception);
}

throw new IllegalStateException("Task stopped before coming online. TaskARN: " + startedTask.getTaskArn());
}
catch (AmazonServiceException exception){
Expand Down Expand Up @@ -247,4 +273,10 @@ private Collection<String> getDockerRunCommand(ECSSlave slave, String jenkinsUrl
command.add(agent.getName());
return command;
}

/**
 * Checked exception marking an ECS task launch failure as transient, so that
 * {@code launchECSTaskWithRetry} may attempt the launch again.
 */
protected static final class RetryableLaunchFailure extends Exception {

    /**
     * Wraps the failure that made the launch retryable.
     *
     * @param cause the underlying exception, kept as this exception's cause
     */
    public RetryableLaunchFailure(Exception cause) {
        super(cause);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package com.cloudbees.jenkins.plugins.amazonecs;


import com.amazonaws.services.ecs.model.Task;
import com.amazonaws.waiters.WaiterUnrecoverableException;
import hudson.model.TaskListener;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.mockito.Mockito;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;

import static org.junit.Assert.assertThrows;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.*;


public class ECSLauncherTest {

@Rule
public JenkinsRule j = new JenkinsRule();

@Test
public void generic_ecs_exception_is_not_retried() throws Exception {

ECSService ecsService = mock(ECSService.class);
ECSCloud cloud = mock(ECSCloud.class);
ECSComputer computer = mock(ECSComputer.class);
TaskListener listener = mock(TaskListener.class);
ByteArrayOutputStream bo = new ByteArrayOutputStream();
Mockito.when(computer.getNode()).thenReturn(mock(ECSSlave.class));
Mockito.when(cloud.getEcsService()).thenReturn(ecsService);
Mockito.when(listener.getLogger()).thenReturn(new PrintStream(bo));

ECSLauncher launcher = Mockito.spy(new ECSLauncher(cloud, "tunnel", ""));

doThrow(new WaiterUnrecoverableException("Generic ecs exception")).when(launcher).launchECSTask(any(ECSComputer.class), any(TaskListener.class), anyLong());

assertThrows("Generic ECS exception", WaiterUnrecoverableException.class, () -> {
launcher.launch(computer, listener);
});

verify(launcher, times(1)).launchECSTask(any(ECSComputer.class), any(TaskListener.class), anyLong());
}

@Test
public void eni_timeout_exception_is_retried() throws Exception {

ECSService ecsService = mock(ECSService.class);
ECSCloud cloud = mock(ECSCloud.class);
ECSComputer computer = mock(ECSComputer.class);
TaskListener listener = mock(TaskListener.class);
ByteArrayOutputStream bo = new ByteArrayOutputStream();
Mockito.when(computer.getNode()).thenReturn(mock(ECSSlave.class));
Mockito.when(cloud.getEcsService()).thenReturn(ecsService);
Mockito.when(listener.getLogger()).thenReturn(new PrintStream(bo));

ECSLauncher launcher = Mockito.spy(new ECSLauncher(cloud, "tunnel", ""));

doThrow(ECSLauncher.RetryableLaunchFailure.class).doReturn(mock(Task.class)).when(launcher).launchECSTask(any(ECSComputer.class), any(TaskListener.class), anyLong());
doNothing().when(launcher).waitForAgent(any(ECSSlave.class), any(TaskListener.class), anyLong());

launcher.launch(computer, listener);

verify(launcher, times(2)).launchECSTask(any(ECSComputer.class), any(TaskListener.class), anyLong());
}
}

0 comments on commit 6a54954

Please sign in to comment.