From f8730c34e58282f5a47b7e721e3b413a2daf6dee Mon Sep 17 00:00:00 2001 From: Stamatis Katsaounis Date: Thu, 25 Jul 2024 09:32:09 +0300 Subject: [PATCH] bug: properly wait for removal of units and machines (#42) --- anvil-python/anvil/commands/haproxy.py | 2 +- anvil-python/anvil/commands/juju.py | 82 +++++++++++++++++++ anvil-python/anvil/commands/maas_agent.py | 2 +- anvil-python/anvil/commands/maas_region.py | 2 +- anvil-python/anvil/commands/postgresql.py | 2 +- anvil-python/anvil/jobs/steps.py | 67 +++++++++++++++ anvil-python/anvil/provider/local/commands.py | 13 ++- 7 files changed, 159 insertions(+), 11 deletions(-) create mode 100644 anvil-python/anvil/jobs/steps.py diff --git a/anvil-python/anvil/commands/haproxy.py b/anvil-python/anvil/commands/haproxy.py index 3268ff6..678469d 100644 --- a/anvil-python/anvil/commands/haproxy.py +++ b/anvil-python/anvil/commands/haproxy.py @@ -26,10 +26,10 @@ from sunbeam.jobs.steps import ( AddMachineUnitsStep, DeployMachineApplicationStep, - RemoveMachineUnitStep, ) from anvil.jobs.manifest import Manifest +from anvil.jobs.steps import RemoveMachineUnitStep LOG = logging.getLogger(__name__) diff --git a/anvil-python/anvil/commands/juju.py b/anvil-python/anvil/commands/juju.py index 3fbaa3b..f8724c5 100644 --- a/anvil-python/anvil/commands/juju.py +++ b/anvil-python/anvil/commands/juju.py @@ -13,12 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging from os import environ import os.path import subprocess from rich.status import Status +from sunbeam.commands.juju import ( + RemoveJujuMachineStep as SunbeamRemoveJujuMachineStep, +) from sunbeam.jobs.common import BaseStep, Result, ResultType +from sunbeam.jobs.juju import CONTROLLER_MODEL + +LOG = logging.getLogger(__name__) class JujuAddSSHKeyStep(BaseStep): @@ -54,3 +61,78 @@ def run(self, status: Status | None) -> Result: message="Could not find public ssh key (~/.ssh/id_rsa.pub)", ) return Result(ResultType.COMPLETED) + + +class RemoveJujuMachineStep(SunbeamRemoveJujuMachineStep): + def run(self, status: Status | None = None) -> Result: + try: + if self.machine_id == -1: + return Result( + ResultType.FAILED, + "Not able to retrieve machine id from Cluster database", + ) + + cmd = [ + self._get_juju_binary(), + "remove-machine", + "-m", + CONTROLLER_MODEL, + str(self.machine_id), + "--no-prompt", + ] + LOG.debug(f'Running command {" ".join(cmd)}') + process = subprocess.run( + cmd, capture_output=True, text=True, check=True + ) + LOG.debug( + f"Command finished. stdout={process.stdout}, stderr={process.stderr}" + ) + except subprocess.CalledProcessError as e: + # Despite the is_skip identified that machine is present in the model there + # is chance that when remove-machine invocation happens, the machine has already + # gone. This can happen since the machine is auto-removed if there is no unit of + # any application, including controller, deployed on it. + if f"machine {self.machine_id} not found" in e.stderr: + return Result(ResultType.COMPLETED) + + LOG.exception( + f"Error removing machine {self.machine_id} from Juju" + ) + LOG.warning(e.stderr) + return Result(ResultType.FAILED, str(e)) + + try: + cmd = [ + self._get_juju_binary(), + "wait-for", + "machine", + "-m", + CONTROLLER_MODEL, + str(self.machine_id), + "--query", + 'life=="dead"', + ] + LOG.debug(f'Running command {" ".join(cmd)}') + process = subprocess.run( + cmd, capture_output=True, text=True, check=True + ) + LOG.debug( + f"Command finished. stdout={process.stdout}, stderr={process.stderr}" + ) + except subprocess.CalledProcessError as e: + # wait-for does not support cases when the machine was not found with the initial query. + # In cases where the machine is removed before waiting-for its removal, the wait-for + # will timeout waiting. We need to check that in case of failure the machine could not + # be found from the beginning. + if e.stderr.startswith( + f'machine "{self.machine_id}" not found, waiting' + ): + LOG.debug("Machine was removed before waiting for it") + return Result(ResultType.COMPLETED) + LOG.exception( + f"Error waiting for removal of machine {self.machine_id} from Juju" + ) + LOG.warning(e.stderr) + return Result(ResultType.FAILED, str(e)) + + return Result(ResultType.COMPLETED) diff --git a/anvil-python/anvil/commands/maas_agent.py b/anvil-python/anvil/commands/maas_agent.py index 0b2934d..7fbc7c8 100644 --- a/anvil-python/anvil/commands/maas_agent.py +++ b/anvil-python/anvil/commands/maas_agent.py @@ -22,10 +22,10 @@ from sunbeam.jobs.steps import ( AddMachineUnitsStep, DeployMachineApplicationStep, - RemoveMachineUnitStep, ) from anvil.jobs.manifest import Manifest +from anvil.jobs.steps import RemoveMachineUnitStep APPLICATION = "maas-agent" CONFIG_KEY = "TerraformVarsMaasagentPlan" diff --git a/anvil-python/anvil/commands/maas_region.py b/anvil-python/anvil/commands/maas_region.py index cc2e9ad..046895c 100644 --- a/anvil-python/anvil/commands/maas_region.py +++ b/anvil-python/anvil/commands/maas_region.py @@ -22,10 +22,10 @@ from sunbeam.jobs.steps import ( AddMachineUnitsStep, DeployMachineApplicationStep, - RemoveMachineUnitStep, ) from anvil.jobs.manifest import Manifest +from anvil.jobs.steps import RemoveMachineUnitStep APPLICATION = "maas-region" CONFIG_KEY = "TerraformVarsMaasregionPlan" diff --git a/anvil-python/anvil/commands/postgresql.py b/anvil-python/anvil/commands/postgresql.py index d447835..9bd4a55 100644 --- a/anvil-python/anvil/commands/postgresql.py +++ b/anvil-python/anvil/commands/postgresql.py @@ -25,10 +25,10 @@ from sunbeam.jobs.steps import ( AddMachineUnitsStep, DeployMachineApplicationStep, - RemoveMachineUnitStep, ) from anvil.jobs.manifest import Manifest +from anvil.jobs.steps import RemoveMachineUnitStep LOG = logging.getLogger(__name__) APPLICATION = "postgresql" diff --git a/anvil-python/anvil/jobs/steps.py b/anvil-python/anvil/jobs/steps.py new file mode 100644 index 0000000..a71daa5 --- /dev/null +++ b/anvil-python/anvil/jobs/steps.py @@ -0,0 +1,67 @@ +# Copyright (c) 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import subprocess + +from rich.status import Status +from sunbeam.commands.juju import JujuStepHelper +from sunbeam.jobs.common import Result, ResultType +from sunbeam.jobs.juju import CONTROLLER_MODEL +from sunbeam.jobs.steps import ( + RemoveMachineUnitStep as SunbeamRemoveMachineUnitStep, +) + +LOG = logging.getLogger(__name__) + + +class RemoveMachineUnitStep(SunbeamRemoveMachineUnitStep, JujuStepHelper): + def run(self, status: Status | None = None) -> Result: + res = super().run(status) + if res.result_type != ResultType.COMPLETED: + return res + try: + cmd = [ + self._get_juju_binary(), + "wait-for", + "unit", + "-m", + CONTROLLER_MODEL, + self.unit, + "--query", + 'life=="dead"', + ] + LOG.debug(f'Running command {" ".join(cmd)}') + process = subprocess.run( + cmd, capture_output=True, text=True, check=True + ) + LOG.debug( + f"Command finished. stdout={process.stdout}, stderr={process.stderr}" + ) + except subprocess.CalledProcessError as e: + # wait-for does not support cases when the unit was not found with the initial query. + # In cases where the unit is removed before waiting-for its removal, the wait-for + # will timeout waiting. We need to check that in case of failure the unit could not + # be found from the beginning. + if e.stderr.startswith(f'unit "{self.unit}" not found, waiting'): + LOG.debug("Unit was removed before waiting for it") + return Result(ResultType.COMPLETED) + LOG.exception( + f"Error waiting for removal of unit {self.unit} from Juju" + ) + LOG.warning(e.stderr) + return Result(ResultType.FAILED, str(e)) + + return Result(ResultType.COMPLETED) diff --git a/anvil-python/anvil/provider/local/commands.py b/anvil-python/anvil/provider/local/commands.py index 53997b3..8db1215 100644 --- a/anvil-python/anvil/provider/local/commands.py +++ b/anvil-python/anvil/provider/local/commands.py @@ -42,7 +42,6 @@ CreateJujuUserStep, JujuLoginStep, RegisterJujuUserStep, - RemoveJujuMachineStep, SaveJujuUserLocallyStep, ) from sunbeam.jobs.checks import ( @@ -74,7 +73,7 @@ RemoveHAProxyUnitStep, haproxy_install_steps, ) -from anvil.commands.juju import JujuAddSSHKeyStep +from anvil.commands.juju import JujuAddSSHKeyStep, RemoveJujuMachineStep from anvil.commands.maas_agent import ( RemoveMAASAgentUnitStep, maas_agent_install_steps, @@ -572,10 +571,7 @@ def remove(ctx: click.Context, name: str) -> None: plan = [ JujuLoginStep(deployment.juju_account), - RemovePostgreSQLUnitStep( - client, name, jhelper, deployment.infrastructure_model - ), - RemoveHAProxyUnitStep( + RemoveMAASAgentUnitStep( client, name, jhelper, deployment.infrastructure_model ), RemoveMAASRegionUnitStep( @@ -584,7 +580,10 @@ def remove(ctx: click.Context, name: str) -> None: ReapplyPostgreSQLTerraformPlanStep( client, manifest_obj, jhelper, deployment.infrastructure_model ), - RemoveMAASAgentUnitStep( + RemoveHAProxyUnitStep( + client, name, jhelper, deployment.infrastructure_model + ), + RemovePostgreSQLUnitStep( client, name, jhelper, deployment.infrastructure_model ), RemoveJujuMachineStep(client, name),