diff --git a/charmcraft.yaml b/charmcraft.yaml
index 1d16ae06..e017ec9d 100644
--- a/charmcraft.yaml
+++ b/charmcraft.yaml
@@ -151,6 +151,12 @@ actions:
           stored by the charm.
     required: [secret-id]
 
+  bootstrap-raft:
+    description: >-
+      Bootstraps raft using a peers.json file. This can help recover when quorum is lost,
+      however, it may cause uncommitted Raft log entries to be committed. See
+      https://developer.hashicorp.com/vault/docs/concepts/integrated-storage#manual-recovery-using-peers-json
+      for more details.
   create-backup:
     description: >-
       Creates a snapshot of the Raft backend and saves it to the S3 storage.
diff --git a/lib/charms/vault_k8s/v0/juju_facade.py b/lib/charms/vault_k8s/v0/juju_facade.py
index a3c2bfa5..16444975 100644
--- a/lib/charms/vault_k8s/v0/juju_facade.py
+++ b/lib/charms/vault_k8s/v0/juju_facade.py
@@ -8,7 +8,9 @@
 
 from ops.charm import CharmBase
 from ops.model import (
-    Application,
+    Application as OpsApplication,
+)
+from ops.model import (
     Binding,
     ModelError,
     Relation,
@@ -16,7 +18,9 @@
     RelationDataError,
     Secret,
     SecretNotFoundError,
-    Unit,
+)
+from ops.model import (
+    Unit as OpsUnit,
 )
 
 # The unique Charmhub library identifier, never change it
@@ -80,11 +84,37 @@ class MultipleRelationsFoundError(FacadeError):
     """Exception raised when multiple relations are found."""
 
 
+class Application:
+    """Wrap an ops Application, exposing `planned_units` as a read-only property."""
+
+    def __init__(self, app: OpsApplication):
+        self._app = app
+
+    @property
+    def planned_units(self) -> int:
+        """Return the number of planned units reported by the wrapped application."""
+        return self._app.planned_units()
+
+
+class Unit:
+    """Wrap an ops Unit, exposing `is_leader` as a read-only property."""
+
+    def __init__(self, unit: OpsUnit):
+        self._unit = unit
+
+    @property
+    def is_leader(self) -> bool:
+        """Return whether the wrapped unit reports itself as the leader."""
+        return self._unit.is_leader()
+
+
 class JujuFacade:
     """Juju API wrapper class."""
 
     def __init__(self, charm: CharmBase):
         self.charm = charm
+        self.app = Application(charm.model.app)
+        self.unit = Unit(charm.model.unit)
 
     # Secret related methods
     def get_secret(self, label: str | None = None, id: str | None = None) -> Secret:
@@ -414,7 +444,7 @@ def relation_exists(self, name: str) -> bool:
 
     def _read_relation_data(
         self,
-        entity: Unit | Application,
+        entity: OpsUnit | OpsApplication,
         id: int | None = None,
         name: str | None = None,
         relation: Relation | None = None,
@@ -500,7 +530,7 @@ def get_unit_relation_data(
 
     def get_remote_unit_relation_data(
         self,
-        unit: Unit,
+        unit: OpsUnit,
         name: str | None = None,
         id: int | None = None,
         relation: Relation | None = None,
@@ -535,7 +565,7 @@ def get_remote_units_relation_data(
     def _set_relation_data(
         self,
         data: dict[str, str],
-        entity: Unit | Application,
+        entity: OpsUnit | OpsApplication,
         name: str | None = None,
         id: int | None = None,
         relation: Relation | None = None,
diff --git a/lib/charms/vault_k8s/v0/vault_kv.py b/lib/charms/vault_k8s/v0/vault_kv.py
index a1efb8dc..706e36b5 100644
--- a/lib/charms/vault_k8s/v0/vault_kv.py
+++ b/lib/charms/vault_k8s/v0/vault_kv.py
@@ -335,7 +335,7 @@ class VaultKvProviderEvents(ops.ObjectEvents):
 
 
 class VaultKvProvides(ops.Object):
-    """Class to be instanciated by the providing side of the relation."""
+    """Class to be instantiated by the providing side of the relation."""
 
     on = VaultKvProviderEvents()  # type: ignore
 
@@ -516,7 +516,7 @@ class VaultKvRequireEvents(ops.ObjectEvents):
 
 
 class VaultKvRequires(ops.Object):
-    """Class to be instanciated by the requiring side of the relation."""
+    """Class to be instantiated by the requiring side of the relation."""
 
     on = VaultKvRequireEvents()  # type: ignore
 
diff --git a/lib/charms/vault_k8s/v0/vault_managers.py b/lib/charms/vault_k8s/v0/vault_managers.py
index cb09195a..1fa60d5e 100644
--- a/lib/charms/vault_k8s/v0/vault_managers.py
+++ b/lib/charms/vault_k8s/v0/vault_managers.py
@@ -28,6 +28,7 @@
 - Depend on each other unless the features explicitly require the dependency.
""" +import json import logging import os from abc import ABC, abstractmethod @@ -457,7 +458,7 @@ def tls_file_available_in_charm(self, file: File) -> bool: raise def ca_certificate_is_saved(self) -> bool: - """Return wether a CA cert and its private key are saved in the charm.""" + """Return whether a CA cert and its private key are saved in the charm.""" return self.ca_certificate_secret_exists() or self.tls_file_pushed_to_workload(File.CA) def _restart_vault(self) -> None: @@ -1447,3 +1448,63 @@ def _get_s3_parameters(self) -> dict[str, str]: if isinstance(value, str): s3_parameters[key] = value.strip() return s3_parameters + + +class RaftManager: + """Encapsulates the business logic for managing the bootstrap of a Vault cluster in Raft mode.""" + + def __init__( + self, + charm: CharmBase, + workload: WorkloadBase, + ): + self._juju_facade = JujuFacade(charm) + self._workload = workload + + def bootstrap(self, node_id: str, address: str) -> None: + """Bootstrap a Vault cluster in Raft mode. + + This method will bootstrap a Vault cluster in Raft mode if it has not + already been bootstrapped. If the cluster is already bootstrapped, this + method will do nothing. + """ + if not self._juju_facade.unit.is_leader: + logger.debug("Only leader unit can bootstrap a Vault cluster") + raise ManagerError("Only the leader unit can bootstrap a Vault cluster") + if not self._juju_facade.app.planned_units == 1: + raise ManagerError("Bootstrapping a Vault cluster requires exactly one unit") + + # TODO: Should the service name be passed in? + self._workload.stop("vault") + self.create_peers_json(node_id, address) + # TODO: Should there be a `start()` method? + self._workload.restart("vault") + + logger.info("Vault cluster bootstrapped in Raft mode") + + def cleanup(self) -> None: + """Clean up the Raft bootstrap configuration. + + This method will clean up the Raft bootstrap configuration if it exists. 
+ """ + self._workload.remove_path("/vault/raft/raft/peers.json") + + def create_peers_json(self, node_id: str, address: str) -> None: + """Create the peers.json file for the Vault cluster. + + This method will create the peers.json file for the Vault cluster based + on the current peers in the relation. + """ + pass + self._workload.push("/vault/raft/raft/peers.json", self._get_peers_json(node_id, address)) + + def _get_peers_json(self, node_id: str, address: str) -> str: + """Return the peers.json file content for the Vault cluster. + + This method will return the content of the peers.json file based on the + current peers in the relation. + + Returns: + The content of the peers.json file. + """ + return json.dumps([{"id": node_id, "address": address}]) diff --git a/pyproject.toml b/pyproject.toml index 0b63b5a3..c3996ac5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,5 +91,5 @@ max-complexity = 10 skip = "build,lib,venv,icon.svg,.tox,.git,.mypy_cache,.ruff_cache,.coverage" [tool.pyright] -include = ["src/**.py"] +include = ["src", "tests", "lib/charms/vault_k8s/v0"] reportMissingParameterType = true diff --git a/src/charm.py b/src/charm.py index b8a43c5a..7693dcf7 100755 --- a/src/charm.py +++ b/src/charm.py @@ -61,6 +61,7 @@ KVManager, ManagerError, PKIManager, + RaftManager, TLSManager, VaultCertsError, ) @@ -208,6 +209,7 @@ def __init__(self, *args: Any): self.framework.observe(self.on.create_backup_action, self._on_create_backup_action) self.framework.observe(self.on.list_backups_action, self._on_list_backups_action) self.framework.observe(self.on.restore_backup_action, self._on_restore_backup_action) + self.framework.observe(self.on.bootstrap_raft_action, self._on_bootstrap_raft_action) self.framework.observe( self.vault_kv.on.vault_kv_client_detached, self._on_vault_kv_client_detached ) @@ -280,7 +282,7 @@ def _on_collect_status(self, event: CollectStatusEvent): # noqa: C901 BlockedStatus("Please authorize charm (see `authorize-charm` action)") ) 
return - if not self._get_authenticated_vault_client(): + if not self._get_active_or_standby_vault_client(): event.add_status(WaitingStatus("Waiting for vault to finish raft leader election")) event.add_status(ActiveStatus()) @@ -323,7 +325,7 @@ def _configure(self, _: EventBase) -> None: # noqa: C901 return except VaultClientError: return - if not (vault := self._get_authenticated_vault_client()): + if not (vault := self._get_active_or_standby_vault_client()): return self._configure_pki_secrets_engine(vault) self._sync_vault_autounseal(vault) @@ -517,6 +519,21 @@ def _on_authorize_charm_action(self, event: ActionEvent) -> None: logger.exception("Vault returned an error while authorizing the charm") event.fail(f"Vault returned an error while authorizing the charm: {str(e)}") + def _on_bootstrap_raft_action(self, event: ActionEvent) -> None: + """Bootstraps the raft cluster when a single node is present. + + This is useful when Vault has lost quorum. The application must first + be reduced to a single unit. + """ + try: + manager = RaftManager(self, self._container) + manager.bootstrap(self._node_id, self._api_address) + except ManagerError as e: + logger.error("Failed to bootstrap raft: %s", e) + event.fail(message=f"Failed to bootstrap raft: {e}") + return + event.set_results({"result": "Raft cluster bootstrapped successfully."}) + def _on_create_backup_action(self, event: ActionEvent) -> None: """Handle the create-backup action. @@ -526,7 +543,7 @@ def _on_create_backup_action(self, event: ActionEvent) -> None: Args: event: ActionEvent """ - vault_client = self._get_authenticated_vault_client() + vault_client = self._get_active_or_standby_vault_client() if not vault_client: event.fail(message="Failed to initialize Vault client.") return @@ -737,6 +754,31 @@ def _get_active_vault_client(self) -> VaultClient | None: def _get_authenticated_vault_client(self) -> VaultClient | None: """Return an authenticated client for the Vault service on this unit. 
+ Returns: + Vault: An active Vault client configured with the cluster address + and CA certificate, and authorized with the AppRole + credentials set upon initial authorization of the charm, or + `None` if the client could not be successfully created or + has not been authorized. + """ + try: + vault = VaultClient( + url=self._api_address, + ca_cert_path=self.tls.get_tls_file_path_in_charm(File.CA), + ) + except VaultCertsError as e: + logger.warning("Failed to get Vault client: %s", e) + return None + if not vault.is_api_available(): + return None + if not (approle := self._get_approle_auth_secret()): + return None + if not vault.authenticate(approle): + return None + + def _get_active_or_standby_vault_client(self) -> VaultClient | None: + """Return an authenticated client for the Vault service on this unit. + Returns: Vault: An active Vault client configured with the cluster address and CA certificate, and authorized with the AppRole diff --git a/tests/unit/lib/charms/vault_k8s/v0/test_vault_managers.py b/tests/unit/lib/charms/vault_k8s/v0/test_vault_managers.py index d49b1d2e..af8a7b54 100644 --- a/tests/unit/lib/charms/vault_k8s/v0/test_vault_managers.py +++ b/tests/unit/lib/charms/vault_k8s/v0/test_vault_managers.py @@ -23,6 +23,7 @@ PKIManager, PrivateKey, ProviderCertificate, + RaftManager, TLSCertificatesProvidesV4, TLSCertificatesRequiresV4, VaultKvProvides, @@ -30,6 +31,7 @@ from charms.vault_k8s.v0.vault_s3 import S3Error from charm import AUTOUNSEAL_MOUNT_PATH, VaultCharm +from container import Container from tests.unit.certificates import ( generate_example_provider_certificate, generate_example_requirer_csr, @@ -723,3 +725,36 @@ def test_given_s3_content_and_vault_client_available_when_restore_backup_then_ba ): self.manager.restore_backup(self.vault_client, "vault-backup-my-model-1") self.vault_client.restore_snapshot.assert_called_once_with(snapshot="snapshot content") + + +class TestRaftManager: + @pytest.fixture(autouse=True) + 
+    @patch("charms.vault_k8s.v0.vault_managers.JujuFacade")
+    def setup(self, juju_facade_mock: MagicMock):
+        self.juju_facade = juju_facade_mock.return_value
+        self.juju_facade.unit.is_leader = True
+        self.juju_facade.app.planned_units = 1
+        self.charm = MagicMock(spec=VaultCharm)
+        self.workload = MagicMock(spec=Container)
+        self.manager = RaftManager(self.charm, self.workload)
+
+    def test_given_non_leader_when_bootstrap_then_error_raised(self):
+        self.juju_facade.unit.is_leader = False
+        with pytest.raises(ManagerError) as e:
+            self.manager.bootstrap("my-node", "my-address")
+        assert str(e.value) == "Only the leader unit can bootstrap a Vault cluster"
+
+    def test_given_many_units_when_bootstrap_then_error_raised(self):
+        self.juju_facade.app.planned_units = 2
+        with pytest.raises(ManagerError) as e:
+            self.manager.bootstrap("my-node", "my-address")
+        assert str(e.value) == "Bootstrapping a Vault cluster requires exactly one unit"
+
+    def test_given_one_unit_and_leader_when_bootstrap_then_peers_json_created(self):
+        self.manager.bootstrap("my-node", "my-address")
+        self.workload.stop.assert_called_once_with("vault")
+        self.workload.push.assert_called_once_with(
+            "/vault/raft/raft/peers.json",
+            '[{"id": "my-node", "address": "my-address"}]',
+        )
+        self.workload.restart.assert_called_once_with("vault")
diff --git a/tox.ini b/tox.ini
index 656d2d32..d068a5fe 100644
--- a/tox.ini
+++ b/tox.ini
@@ -40,7 +40,7 @@ commands =
 [testenv:lint]
 description = Check code against coding style standards
 commands =
-    codespell {tox_root}
+    codespell {[vars]all_path}
     ruff check {[vars]all_path}
     ruff format --check {[vars]all_path}
 