Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: scale Juju controllers according to anvil cluster size #30

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ ubuntu@infra3:~$ maas-anvil cluster join \
--token eyJuYW1lIjoibWFhcy00Lm1hYXMiLCJzZWNyZXQiOiI3MmE512342abcdEASWWxOWNlYWNkYmJjMWRmMjk4OThkYWFkYzQzMDAzZjk4NmRkZDI2MWRhYWVkZTIxIiwiZmluZ2VycHJpbnQiOiJlODU5ZmY5NjAwMDU4OGFjZmQ5ZDM0NjFhMDk5NmU1YTU3YjhjN2Q2ZjE4M2NjZDRlOTg2NGRkZjQ3NWMwZWM1Iiwiam9pbl9hZGRyZXNzZXMiOlsiMTAuMjAuMC43OjcwMDAiLCIxMC4yMC4wLjg6NzAwMCJdfQ==
```

#### Juju Controller HA
Starting with the third machine joining the cluster, `maas-anvil` automatically adds Juju controllers to machines that are missing them. This occurs at every other join (when machines 3, 5, 7, and so on join).

### Confirm the cluster status

```bash
Expand Down
102 changes: 102 additions & 0 deletions anvil-python/anvil/commands/juju.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from os import environ
import os.path
import random
import subprocess

from rich.status import Status
from sunbeam.commands.juju import JujuStepHelper
from sunbeam.jobs.common import BaseStep, Result, ResultType
from sunbeam.jobs.juju import JujuHelper, run_sync

from anvil.jobs.juju import CONTROLLER

LOG = logging.getLogger(__name__)
MAX_JUJU_CONTROLLERS = 3


class JujuAddSSHKeyStep(BaseStep):
Expand Down Expand Up @@ -54,3 +63,96 @@ def run(self, status: Status | None) -> Result:
message="Could not find public ssh key (~/.ssh/id_rsa.pub)",
)
return Result(ResultType.COMPLETED)


class ScaleJujuStep(BaseStep, JujuStepHelper):
    """Enable Juju HA by placing controllers on machines that lack one.

    ``is_skip`` collects the machines of ``model`` and the machines that
    already host a controller; ``run`` then calls ``juju enable-ha`` for up
    to ``MAX_JUJU_CONTROLLERS`` controllers in total and waits for the
    controller application.
    """

    def __init__(
        self,
        jhelper: JujuHelper,
        model: str,
    ):
        """Create the step.

        :param jhelper: helper used to list the machines of ``model``.
        :param model: name of the model whose machines may host controllers.
        """
        super().__init__("Juju HA", "Enable Juju High Availability")

        self.jhelper = jhelper
        self.model = model

        # Both sets are filled in by is_skip(); they are empty until then.
        self.controller_machines: set[str] = set()
        self.machines: set[str] = set()

    def _available_machines(self) -> set[str]:
        """Return the model machines that do not host a controller yet."""
        # Plain difference, not symmetric difference: a controller machine
        # that is missing from the model's machine list must never be
        # offered as a placement target.
        return self.machines - self.controller_machines

    def _run_and_log(self, cmd: list[str]) -> None:
        """Run ``cmd`` and log its output at debug level.

        :raises subprocess.CalledProcessError: if the command exits non-zero.
        """
        LOG.debug("Running command %s", " ".join(cmd))
        process = subprocess.run(
            cmd, capture_output=True, text=True, check=True
        )
        LOG.debug(
            "Command finished. stdout=%s, stderr=%s",
            process.stdout,
            process.stderr,
        )

    def run(self, status: Status | None = None) -> Result:
        """Run the step to completion.

        :param status: optional status object updated while waiting.
        :returns: ``COMPLETED`` when scaling succeeded or nothing had to be
            done, ``FAILED`` (with juju's error output as message) when a
            juju command exited non-zero.
        """
        available_machines = list(self._available_machines())
        n_machines_to_join = min(
            len(available_machines),
            MAX_JUJU_CONTROLLERS - len(self.controller_machines),
        )
        if n_machines_to_join <= 0:
            # Nothing to place (or already at/over the limit); a negative
            # count would also make random.sample() raise ValueError.
            LOG.debug("No Juju controllers to add, nothing to do")
            return Result(ResultType.COMPLETED)

        juju = self._get_juju_binary()
        enable_ha_cmd = [
            juju,
            "enable-ha",
            "-n",
            str(len(self.controller_machines) + n_machines_to_join),
            "--to",
            ",".join(
                str(s)
                for s in random.sample(available_machines, n_machines_to_join)
            ),
        ]
        wait_cmd = [
            juju,
            "wait-for",
            "application",
            "-m",
            "admin/controller",
            "controller",
            "--timeout",
            "15m",
        ]

        try:
            self._run_and_log(enable_ha_cmd)
            self.update_status(status, "scaling controller")
            LOG.debug("Waiting for HA to be enabled")
            self._run_and_log(wait_cmd)
        except subprocess.CalledProcessError as e:
            # juju can legitimately refuse, e.g. "juju-ha-space is not set
            # and a unique usable address was not found for machines: 0".
            # Report that as a failed step with the reason attached instead
            # of letting the exception escape as an unexpected error.
            reason = (e.stderr or "").strip() or str(e)
            LOG.debug(
                "Command %s failed. stdout=%s, stderr=%s",
                " ".join(str(arg) for arg in e.cmd),
                e.stdout,
                e.stderr,
            )
            return Result(
                ResultType.FAILED,
                message=f"Failed to scale Juju controllers: {reason}",
            )

        return Result(ResultType.COMPLETED)

    def is_skip(self, status: Status | None = None) -> Result:
        """Determines if the step should be skipped or not.

        Refreshes ``controller_machines`` and ``machines`` as a side effect,
        which ``run`` relies on.

        :returns: ``SKIPPED`` when the controller limit is reached, no
            machine is available, or the model has fewer than 3 machines;
            ``COMPLETED`` otherwise.
        """
        self.controller_machines = set(
            self.get_controller(CONTROLLER)["controller-machines"].keys()
        )
        self.machines = set(
            run_sync(self.jhelper.get_machines(self.model)).keys()
        )

        # ">=" rather than "==": being over the limit must skip as well.
        if len(self.controller_machines) >= MAX_JUJU_CONTROLLERS:
            LOG.debug(
                "Number of machines with controllers must not be greater than "
                f"{MAX_JUJU_CONTROLLERS}, skipping scaling Juju controllers"
            )
            return Result(ResultType.SKIPPED)
        if not self._available_machines():
            LOG.debug(
                "No available machines, skipping scaling Juju controllers"
            )
            return Result(ResultType.SKIPPED)
        if len(self.machines) < 3:
            LOG.debug("Number of machines must be at least 3")
            return Result(ResultType.SKIPPED)

        return Result(ResultType.COMPLETED)
11 changes: 10 additions & 1 deletion anvil-python/anvil/jobs/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import shutil
from typing import Any, Dict, List

from pydantic import Field
from pydantic.dataclasses import dataclass
from snaphelpers import Snap
from sunbeam import utils
Expand All @@ -36,7 +37,6 @@
from sunbeam.jobs.deployment import Deployment
from sunbeam.jobs.manifest import (
CharmsManifest,
JujuManifest,
MissingTerraformInfoException,
TerraformManifest,
)
Expand All @@ -52,6 +52,15 @@
LOG = logging.getLogger(__name__)


@dataclass
class JujuManifest:
    """Juju-related manifest data.

    ``bootstrap_args`` carries extra arguments for ``juju bootstrap`` and
    defaults to an empty list.
    """

    bootstrap_args: list[str] = Field(
        alias="bootstrap_args",
        description="Extra args for juju bootstrap",
        default=[],
    )


@dataclass(config=dict(extra="allow")) # type: ignore[call-overload]
class SoftwareConfig:
deployment: InitVar[Deployment]
Expand Down
6 changes: 3 additions & 3 deletions anvil-python/anvil/provider/local/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path
from typing import List
Expand Down Expand Up @@ -74,7 +73,7 @@
RemoveHAProxyUnitStep,
haproxy_install_steps,
)
from anvil.commands.juju import JujuAddSSHKeyStep
from anvil.commands.juju import JujuAddSSHKeyStep, ScaleJujuStep
from anvil.commands.maas_agent import (
RemoveMAASAgentUnitStep,
maas_agent_install_steps,
Expand Down Expand Up @@ -499,7 +498,7 @@ def join(
name,
)
)

plan2.append(ScaleJujuStep(jhelper, deployment.infrastructure_model))
run_plan(plan2, console)

click.echo(f"Node joined cluster with roles: {pretty_roles}")
Expand Down Expand Up @@ -591,6 +590,7 @@ def remove(ctx: click.Context, name: str) -> None:
# Cannot remove user as the same user name cannot be reused,
# so commenting the RemoveJujuUserStep
# RemoveJujuUserStep(name),
ScaleJujuStep(jhelper, deployment.infrastructure_model),
ClusterRemoveNodeStep(client, name),
]
run_plan(plan, console)
Expand Down
Loading