Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WOR-1777 Handle more instances of ManagementExceptions when deleting resources #1777

Merged
merged 6 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,22 +1,34 @@
package bio.terra.workspace.service.resource.controlled.cloud.azure;

import bio.terra.landingzone.db.exception.LandingZoneNotFoundException;
import bio.terra.lz.futureservice.client.ApiException;
import bio.terra.stairway.FlightContext;
import bio.terra.stairway.StepResult;
import bio.terra.stairway.StepStatus;
import bio.terra.workspace.amalgam.landingzone.azure.LandingZoneServiceApiException;
import bio.terra.workspace.common.exception.AzureManagementExceptionUtils;
import bio.terra.workspace.service.resource.controlled.flight.delete.DeleteControlledResourceStep;
import bio.terra.workspace.service.resource.controlled.model.ControlledResource;
import com.azure.core.management.exception.ManagementException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpStatus;

/**
* Common base class for deleting Azure resources. Wraps the Azure resource deletion logic with
* handling for common exceptions.
*/
public abstract class DeleteAzureControlledResourceStep implements DeleteControlledResourceStep {
public abstract class DeleteAzureControlledResourceStep<R extends ControlledResource>
implements DeleteControlledResourceStep {
private final Logger logger = LoggerFactory.getLogger(getClass());
protected final R resource;

protected DeleteAzureControlledResourceStep(R resource) {
this.resource = resource;
}

@Override
public StepResult doStep(FlightContext context) throws InterruptedException {
Expand Down Expand Up @@ -76,14 +88,32 @@ protected StepResult handleResourceDeleteException(Exception e, FlightContext co
// When retrieving shared resources from the landing zone,
// LZS can encounter the same management exceptions as above
// To detect this, we need to examine the message for those error codes
if (e instanceof LandingZoneServiceApiException && e.getCause() instanceof ApiException) {
var msg = e.getCause().getMessage();
if (msg != null
&& missingResourceManagementCodes.stream().anyMatch(code -> msg.contains(code))) {
if (e instanceof LandingZoneServiceApiException && e.getCause() instanceof ApiException apiEx) {
var code =
Optional.ofNullable(apiEx.getMessage())
.flatMap(
msg ->
missingResourceManagementCodes.stream()
.filter(c -> msg.contains(c))
.findFirst());
if (code.isPresent()) {
logger.debug(
"Unable to delete resource {} from workspace {}, because LZS encountered management exception {}: Returning success for delete step",
resource.getResourceType().name(),
resource.getWorkspaceId(),
code.get());
return StepResult.getStepResultSuccess();
}
}

if (e instanceof LandingZoneNotFoundException) {
// If the landing zone is not present, it's probably because it was removed directly
logger.debug(
"Unable to delete resource {} from workspace {}, because no landing zone was found in LZS: Returning success for delete step",
resource.getResourceType().name(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, e);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,23 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DeleteAzureBatchPoolStep extends DeleteAzureControlledResourceStep {
public class DeleteAzureBatchPoolStep
extends DeleteAzureControlledResourceStep<ControlledAzureBatchPoolResource> {
private static final Logger logger = LoggerFactory.getLogger(DeleteAzureBatchPoolStep.class);

private final AzureConfiguration azureConfig;
private final CrlService crlService;
private final LandingZoneBatchAccountFinder landingZoneBatchAccountFinder;
private final ControlledAzureBatchPoolResource resource;

public DeleteAzureBatchPoolStep(
AzureConfiguration azureConfig,
CrlService crlService,
LandingZoneBatchAccountFinder landingZoneBatchAccountFinder,
ControlledAzureBatchPoolResource resource) {
super(resource);
this.azureConfig = azureConfig;
this.crlService = crlService;
this.landingZoneBatchAccountFinder = landingZoneBatchAccountFinder;
this.resource = resource;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
* A step for deleting a controlled Azure Database resource. This step uses the following process to
* actually delete the Azure Database.
*/
public class DeleteAzureDatabaseStep extends DeleteAzureControlledResourceStep {
public class DeleteAzureDatabaseStep
extends DeleteAzureControlledResourceStep<ControlledAzureDatabaseResource> {
private static final Logger logger = LoggerFactory.getLogger(DeleteAzureDatabaseStep.class);
private final AzureConfiguration azureConfig;
private final CrlService crlService;
private final ControlledAzureDatabaseResource resource;
private final LandingZoneApiDispatch landingZoneApiDispatch;
private final SamService samService;
private final WorkspaceService workspaceService;
Expand All @@ -39,9 +39,9 @@ public DeleteAzureDatabaseStep(
SamService samService,
WorkspaceService workspaceService,
UUID workspaceId) {
super(resource);
this.crlService = crlService;
this.azureConfig = azureConfig;
this.resource = resource;
this.landingZoneApiDispatch = landingZoneApiDispatch;
this.samService = samService;
this.workspaceService = workspaceService;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class DeleteAzureDiskStep extends DeleteAzureControlledResourceStep {

public DeleteAzureDiskStep(
AzureConfiguration azureConfig, CrlService crlService, ControlledAzureDiskResource resource) {
super(resource);
this.crlService = crlService;
this.azureConfig = azureConfig;
this.resource = resource;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import bio.terra.workspace.app.configuration.external.AzureConfiguration;
import bio.terra.workspace.common.exception.AzureManagementExceptionUtils;
import bio.terra.workspace.service.crl.CrlService;
import bio.terra.workspace.service.resource.controlled.flight.delete.DeleteControlledResourceStep;
import bio.terra.workspace.service.resource.controlled.cloud.azure.DeleteAzureControlledResourceStep;
import bio.terra.workspace.service.workspace.model.AzureCloudContext;
import com.azure.core.management.exception.ManagementException;
import com.azure.resourcemanager.compute.ComputeManager;
Expand All @@ -24,85 +24,91 @@
* functionality of detaching data disk from a virtual machine before disk deletion. It is not
* possible to delete a disk without detaching it.
*/
public class DetachAzureDiskFromVmStep implements DeleteControlledResourceStep {
public class DetachAzureDiskFromVmStep
extends DeleteAzureControlledResourceStep<ControlledAzureDiskResource> {
private static final Logger logger = LoggerFactory.getLogger(DetachAzureDiskFromVmStep.class);

private final CrlService crlService;
private final AzureConfiguration azureConfig;
private final ControlledAzureDiskResource resource;

public DetachAzureDiskFromVmStep(
AzureConfiguration azureConfig, CrlService crlService, ControlledAzureDiskResource resource) {
super(resource);
this.crlService = crlService;
this.azureConfig = azureConfig;
this.resource = resource;
}

@Override
public StepResult doStep(FlightContext context) throws InterruptedException, RetryException {
public StepResult deleteResource(FlightContext context)
throws InterruptedException, RetryException {
final AzureCloudContext azureCloudContext =
DeleteAzureDiskFlightUtils.getAzureCloudContext(context);
final String diskResourceId =
DeleteAzureDiskFlightUtils.getAzureDiskResourceId(azureCloudContext, resource);
ComputeManager computeManager = crlService.getComputeManager(azureCloudContext, azureConfig);
try {
var disk = computeManager.disks().getById(diskResourceId);
if (!disk.isAttachedToVirtualMachine()) {
logger.info(
"Disk {} in MRG {} for workspace {} is not attached to vm.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
var vm = getVirtualMachine(computeManager, disk.virtualMachineId());
if (vm.isEmpty()) {
// this is unlikely event - disk is attached, but the vm cannot be found; let's retry
logger.warn(
"Disk {} in MRG {} for workspace {} is attached to vm, but vm cannot be found. Retrying.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY);
}
var attachedDisk = tryToFindAttachedDisk(vm.get(), resource.getDiskName());
if (attachedDisk.isPresent()) {
detachDisk(vm.get(), attachedDisk.get().getKey());
} else {
// this is unlikely event - disk is attached, but the vm doesn't
// have it in the list of attached disks; let's retry
logger.warn(
"Disk {} in MRG {} for workspace {} is attached, but not found in the list of attached disk for vm {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId(),
disk.virtualMachineId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY);
}
} catch (ManagementException ex) {
if (AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.RESOURCE_NOT_FOUND)) {
logger.info(
"Disk {} does not exist in MRG {} for workspace {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
if (AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.OPERATION_NOT_ALLOWED)) {
logger.error(
"Failed attempt to detach disk {} from vm in MRG {} for workspace {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_FATAL, ex);
}
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, ex);
var disk = computeManager.disks().getById(diskResourceId);
if (!disk.isAttachedToVirtualMachine()) {
logger.info(
"Disk {} in MRG {} for workspace {} is not attached to vm.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
var vm = getVirtualMachine(computeManager, disk.virtualMachineId());
if (vm.isEmpty()) {
// this is unlikely event - disk is attached, but the vm cannot be found; let's retry
logger.warn(
"Disk {} in MRG {} for workspace {} is attached to vm, but vm cannot be found. Retrying.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY);
}
var attachedDisk = tryToFindAttachedDisk(vm.get(), resource.getDiskName());
if (attachedDisk.isPresent()) {
detachDisk(vm.get(), attachedDisk.get().getKey());
} else {
// this is unlikely event - disk is attached, but the vm doesn't
// have it in the list of attached disks; let's retry
logger.warn(
"Disk {} in MRG {} for workspace {} is attached, but not found in the list of attached disk for vm {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId(),
disk.virtualMachineId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY);
}
return StepResult.getStepResultSuccess();
}

@Override
protected StepResult handleResourceDeleteException(Exception e, FlightContext context) {
final AzureCloudContext azureCloudContext =
DeleteAzureDiskFlightUtils.getAzureCloudContext(context);
if (e instanceof ManagementException ex
&& AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.RESOURCE_NOT_FOUND)) {
logger.info(
"Disk '{}' does not exist in MRG {} for workspace {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
} else if (e instanceof ManagementException ex
&& AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.OPERATION_NOT_ALLOWED)) {
logger.error(
"Failed attempt to detach disk {} from vm in MRG {} for workspace {}.",
resource.getResourceId(),
azureCloudContext.getAzureResourceGroupId(),
resource.getWorkspaceId());
return new StepResult(StepStatus.STEP_RESULT_FAILURE_FATAL, ex);
} else {
return super.handleResourceDeleteException(e, context);
}
}

@Override
public StepResult undoStep(FlightContext context) throws InterruptedException {
final AzureCloudContext azureCloudContext =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@

import bio.terra.stairway.FlightContext;
import bio.terra.stairway.StepResult;
import bio.terra.stairway.StepStatus;
import bio.terra.stairway.exception.RetryException;
import bio.terra.workspace.app.configuration.external.AzureConfiguration;
import bio.terra.workspace.common.exception.AzureManagementExceptionUtils;
import bio.terra.workspace.service.crl.CrlService;
import bio.terra.workspace.service.resource.controlled.flight.delete.DeleteControlledResourceStep;
import bio.terra.workspace.service.resource.controlled.cloud.azure.DeleteAzureControlledResourceStep;
import bio.terra.workspace.service.workspace.model.AzureCloudContext;
import com.azure.core.management.exception.ManagementException;
import com.azure.resourcemanager.compute.ComputeManager;
Expand All @@ -16,54 +15,58 @@

/**
* This steps works as a companion step for a disk deletion process. It gets identifier of a virtual
* machine to which data disk is attached and saves it into flight's working map. This data is
* required for 'undo' operation at DetachAzureDiskFromVmStep. It is required in case a disk has
* been detached from a virtual machine, but we need to attach it back in corresponding 'undo'
* operation.
* machine to which data disk is attached and saves it into flight's working map. It is required in
* case a disk has been detached from a virtual machine, but we need to attach it back in
* corresponding 'undo' operation. This is a separate step because the data is required for the
* 'undo' operation at DetachAzureDiskFromVmStep, and we need to guarantee its persistence - which
* we can't guarantee if it is from the 'do' portion of the same step.
*/
public class GetAzureDiskAttachedVmStep implements DeleteControlledResourceStep {
public class GetAzureDiskAttachedVmStep
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a better suggestion unfortunately, but just calling out that I find it rather confusing that this step is a Get step that also extends DeleteAzureControlledResourceStep and it has a deleteResource method that doesn't delete anything.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know. Naming things is hard.
There is a fairly extensive explanation in the comments, for this reason.

extends DeleteAzureControlledResourceStep<ControlledAzureDiskResource> {

private static final Logger logger = LoggerFactory.getLogger(GetAzureDiskAttachedVmStep.class);

private final CrlService crlService;
private final AzureConfiguration azureConfig;
private final ControlledAzureDiskResource resource;

public GetAzureDiskAttachedVmStep(
AzureConfiguration azureConfig, CrlService crlService, ControlledAzureDiskResource resource) {
super(resource);
this.crlService = crlService;
this.azureConfig = azureConfig;
this.resource = resource;
}

@Override
public StepResult doStep(FlightContext context) throws InterruptedException, RetryException {
public StepResult deleteResource(FlightContext context)
throws InterruptedException, RetryException {
final AzureCloudContext azureCloudContext =
DeleteAzureDiskFlightUtils.getAzureCloudContext(context);
final String diskResourceId =
DeleteAzureDiskFlightUtils.getAzureDiskResourceId(azureCloudContext, resource);
ComputeManager computeManager = crlService.getComputeManager(azureCloudContext, azureConfig);
try {
var disk = computeManager.disks().getById(diskResourceId);
if (disk.isAttachedToVirtualMachine()) {
context
.getWorkingMap()
.put(DeleteAzureDiskFlightUtils.DISK_ATTACHED_VM_ID_KEY, disk.virtualMachineId());
}
} catch (ManagementException ex) {
if (AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.RESOURCE_NOT_FOUND)) {
logger.info(
"Disk '{}' does not exist in workspace {}.",
resource.getResourceId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
return new StepResult(StepStatus.STEP_RESULT_FAILURE_RETRY, ex);
var disk = computeManager.disks().getById(diskResourceId);
if (disk.isAttachedToVirtualMachine()) {
context
.getWorkingMap()
.put(DeleteAzureDiskFlightUtils.DISK_ATTACHED_VM_ID_KEY, disk.virtualMachineId());
}
return StepResult.getStepResultSuccess();
}

@Override
protected StepResult handleResourceDeleteException(Exception e, FlightContext context) {
if (e instanceof ManagementException ex
&& AzureManagementExceptionUtils.isExceptionCode(
ex, AzureManagementExceptionUtils.RESOURCE_NOT_FOUND)) {
logger.info(
"Disk '{}' does not exist in workspace {}.",
resource.getResourceId(),
resource.getWorkspaceId());
return StepResult.getStepResultSuccess();
}
return super.handleResourceDeleteException(e, context);
}

@Override
public StepResult undoStep(FlightContext context) throws InterruptedException {
// nothing to undo here
Expand Down
Loading
Loading