Skip to content

Commit

Permalink
bug: properly wait for removal of units and machines (canonical#42)
Browse files Browse the repository at this point in the history
  • Loading branch information
skatsaounis authored Jul 25, 2024
1 parent d796f30 commit f8730c3
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 11 deletions.
2 changes: 1 addition & 1 deletion anvil-python/anvil/commands/haproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@
from sunbeam.jobs.steps import (
AddMachineUnitsStep,
DeployMachineApplicationStep,
RemoveMachineUnitStep,
)

from anvil.jobs.manifest import Manifest
from anvil.jobs.steps import RemoveMachineUnitStep

LOG = logging.getLogger(__name__)

Expand Down
82 changes: 82 additions & 0 deletions anvil-python/anvil/commands/juju.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from os import environ
import os.path
import subprocess

from rich.status import Status
from sunbeam.commands.juju import (
RemoveJujuMachineStep as SunbeamRemoveJujuMachineStep,
)
from sunbeam.jobs.common import BaseStep, Result, ResultType
from sunbeam.jobs.juju import CONTROLLER_MODEL

LOG = logging.getLogger(__name__)


class JujuAddSSHKeyStep(BaseStep):
Expand Down Expand Up @@ -54,3 +61,78 @@ def run(self, status: Status | None) -> Result:
message="Could not find public ssh key (~/.ssh/id_rsa.pub)",
)
return Result(ResultType.COMPLETED)


class RemoveJujuMachineStep(SunbeamRemoveJujuMachineStep):
    """Remove a machine from the controller model and wait until it is dead.

    Overrides ``run`` of the Sunbeam step so that, after issuing
    ``juju remove-machine``, the step also blocks on ``juju wait-for machine``
    until the machine's ``life`` equals ``dead``.
    """

    def run(self, status: Status | None = None) -> Result:
        """Issue the removal, then wait for the machine to be reported dead.

        Args:
            status: Optional rich status handle; not used by this method.

        Returns:
            ``Result(ResultType.COMPLETED)`` when the machine was removed (or
            was already gone), otherwise ``Result(ResultType.FAILED, ...)``
            carrying either a fixed message (machine id unknown) or the
            string form of the ``CalledProcessError``.
        """
        machine_id = self.machine_id

        # -1 means the id could not be retrieved (see the message below);
        # there is nothing to remove in that case.
        if machine_id == -1:
            return Result(
                ResultType.FAILED,
                "Not able to retrieve machine id from Cluster database",
            )

        # Phase 1: ask Juju to remove the machine.
        try:
            remove_cmd = [
                self._get_juju_binary(),
                "remove-machine",
                "-m",
                CONTROLLER_MODEL,
                str(machine_id),
                "--no-prompt",
            ]
            printable = " ".join(remove_cmd)
            LOG.debug(f"Running command {printable}")
            removal = subprocess.run(
                remove_cmd, capture_output=True, text=True, check=True
            )
            LOG.debug(
                f"Command finished. stdout={removal.stdout}, stderr={removal.stderr}"
            )
        except subprocess.CalledProcessError as exc:
            # The machine may have vanished between the skip check and this
            # call: Juju auto-removes a machine once no unit of any
            # application (controller included) is deployed on it. Treat
            # "not found" as success.
            already_gone = f"machine {machine_id} not found" in exc.stderr
            if not already_gone:
                LOG.exception(f"Error removing machine {machine_id} from Juju")
                LOG.warning(exc.stderr)
                return Result(ResultType.FAILED, str(exc))
            return Result(ResultType.COMPLETED)

        # Phase 2: block until Juju reports the machine as dead.
        try:
            wait_cmd = [
                self._get_juju_binary(),
                "wait-for",
                "machine",
                "-m",
                CONTROLLER_MODEL,
                str(machine_id),
                "--query",
                'life=="dead"',
            ]
            printable = " ".join(wait_cmd)
            LOG.debug(f"Running command {printable}")
            waited = subprocess.run(
                wait_cmd, capture_output=True, text=True, check=True
            )
            LOG.debug(
                f"Command finished. stdout={waited.stdout}, stderr={waited.stderr}"
            )
        except subprocess.CalledProcessError as exc:
            # wait-for cannot succeed when the machine was already missing at
            # its first query; it fails (after timing out) with a
            # "not found, waiting" message. That case means the machine was
            # removed before the wait started, so it counts as success.
            missing_prefix = f'machine "{machine_id}" not found, waiting'
            if not exc.stderr.startswith(missing_prefix):
                LOG.exception(
                    f"Error waiting for removal of machine {machine_id} from Juju"
                )
                LOG.warning(exc.stderr)
                return Result(ResultType.FAILED, str(exc))
            LOG.debug("Machine was removed before waiting for it")
            return Result(ResultType.COMPLETED)

        return Result(ResultType.COMPLETED)
2 changes: 1 addition & 1 deletion anvil-python/anvil/commands/maas_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from sunbeam.jobs.steps import (
AddMachineUnitsStep,
DeployMachineApplicationStep,
RemoveMachineUnitStep,
)

from anvil.jobs.manifest import Manifest
from anvil.jobs.steps import RemoveMachineUnitStep

APPLICATION = "maas-agent"
CONFIG_KEY = "TerraformVarsMaasagentPlan"
Expand Down
2 changes: 1 addition & 1 deletion anvil-python/anvil/commands/maas_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from sunbeam.jobs.steps import (
AddMachineUnitsStep,
DeployMachineApplicationStep,
RemoveMachineUnitStep,
)

from anvil.jobs.manifest import Manifest
from anvil.jobs.steps import RemoveMachineUnitStep

APPLICATION = "maas-region"
CONFIG_KEY = "TerraformVarsMaasregionPlan"
Expand Down
2 changes: 1 addition & 1 deletion anvil-python/anvil/commands/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from sunbeam.jobs.steps import (
AddMachineUnitsStep,
DeployMachineApplicationStep,
RemoveMachineUnitStep,
)

from anvil.jobs.manifest import Manifest
from anvil.jobs.steps import RemoveMachineUnitStep

LOG = logging.getLogger(__name__)
APPLICATION = "postgresql"
Expand Down
67 changes: 67 additions & 0 deletions anvil-python/anvil/jobs/steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) 2024 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import subprocess

from rich.status import Status
from sunbeam.commands.juju import JujuStepHelper
from sunbeam.jobs.common import Result, ResultType
from sunbeam.jobs.juju import CONTROLLER_MODEL
from sunbeam.jobs.steps import (
RemoveMachineUnitStep as SunbeamRemoveMachineUnitStep,
)

LOG = logging.getLogger(__name__)


class RemoveMachineUnitStep(SunbeamRemoveMachineUnitStep, JujuStepHelper):
    """Remove a unit via the Sunbeam step, then wait until Juju marks it dead.

    The parent ``run`` performs the removal; this override adds a blocking
    ``juju wait-for unit`` on ``life=="dead"`` afterwards.
    """

    def run(self, status: Status | None = None) -> Result:
        """Run the parent removal and wait for the unit to be reported dead.

        Args:
            status: Optional rich status handle, forwarded to the parent step.

        Returns:
            The parent's result unchanged when it is not ``COMPLETED``;
            ``Result(ResultType.COMPLETED)`` once the unit is dead (or was
            already gone); ``Result(ResultType.FAILED, ...)`` with the string
            form of the ``CalledProcessError`` when waiting fails otherwise.
        """
        parent_result = super().run(status)
        if parent_result.result_type != ResultType.COMPLETED:
            # Removal itself did not complete; nothing to wait for.
            return parent_result

        unit = self.unit
        try:
            # NOTE(review): the wait targets CONTROLLER_MODEL; confirm this is
            # the same model the unit was removed from by the parent step.
            wait_cmd = [
                self._get_juju_binary(),
                "wait-for",
                "unit",
                "-m",
                CONTROLLER_MODEL,
                unit,
                "--query",
                'life=="dead"',
            ]
            printable = " ".join(wait_cmd)
            LOG.debug(f"Running command {printable}")
            waited = subprocess.run(
                wait_cmd, capture_output=True, text=True, check=True
            )
            LOG.debug(
                f"Command finished. stdout={waited.stdout}, stderr={waited.stderr}"
            )
        except subprocess.CalledProcessError as exc:
            # wait-for cannot succeed when the unit was already missing at its
            # first query; it fails (after timing out) with a
            # "not found, waiting" message. That case means the unit was
            # removed before the wait started, so it counts as success.
            missing_prefix = f'unit "{unit}" not found, waiting'
            if not exc.stderr.startswith(missing_prefix):
                LOG.exception(
                    f"Error waiting for removal of unit {unit} from Juju"
                )
                LOG.warning(exc.stderr)
                return Result(ResultType.FAILED, str(exc))
            LOG.debug("Unit was removed before waiting for it")
            return Result(ResultType.COMPLETED)

        return Result(ResultType.COMPLETED)
13 changes: 6 additions & 7 deletions anvil-python/anvil/provider/local/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
CreateJujuUserStep,
JujuLoginStep,
RegisterJujuUserStep,
RemoveJujuMachineStep,
SaveJujuUserLocallyStep,
)
from sunbeam.jobs.checks import (
Expand Down Expand Up @@ -74,7 +73,7 @@
RemoveHAProxyUnitStep,
haproxy_install_steps,
)
from anvil.commands.juju import JujuAddSSHKeyStep
from anvil.commands.juju import JujuAddSSHKeyStep, RemoveJujuMachineStep
from anvil.commands.maas_agent import (
RemoveMAASAgentUnitStep,
maas_agent_install_steps,
Expand Down Expand Up @@ -572,10 +571,7 @@ def remove(ctx: click.Context, name: str) -> None:

plan = [
JujuLoginStep(deployment.juju_account),
RemovePostgreSQLUnitStep(
client, name, jhelper, deployment.infrastructure_model
),
RemoveHAProxyUnitStep(
RemoveMAASAgentUnitStep(
client, name, jhelper, deployment.infrastructure_model
),
RemoveMAASRegionUnitStep(
Expand All @@ -584,7 +580,10 @@ def remove(ctx: click.Context, name: str) -> None:
ReapplyPostgreSQLTerraformPlanStep(
client, manifest_obj, jhelper, deployment.infrastructure_model
),
RemoveMAASAgentUnitStep(
RemoveHAProxyUnitStep(
client, name, jhelper, deployment.infrastructure_model
),
RemovePostgreSQLUnitStep(
client, name, jhelper, deployment.infrastructure_model
),
RemoveJujuMachineStep(client, name),
Expand Down

0 comments on commit f8730c3

Please sign in to comment.