Skip to content

Commit

Permalink
add restore_availability_zone_power_after_interruption action
Browse files Browse the repository at this point in the history
Signed-off-by: Sylvain Hellegouarch <sh@defuze.org>
  • Loading branch information
Lawouach committed Jan 21, 2024
1 parent 41100d6 commit c6f90fa
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 6 deletions.
46 changes: 45 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,51 @@

## [Unreleased][]

[Unreleased]: https://github.com/chaostoolkit-incubator/chaostoolkit-aws/compare/0.30.0...HEAD
[Unreleased]: https://github.com/chaostoolkit-incubator/chaostoolkit-aws/compare/0.31.0...HEAD

## [0.31.0][] - 2024-01-21

[0.31.0]: https://github.com/chaostoolkit-incubator/chaostoolkit-aws/compare/0.30.0...0.31.0

### Added

- `restore_availability_zone_power_after_interruption` action to roll back the
`start_availability_zone_power_interruption_scenario` action

```json
{
"version": "1.0.0",
"title": "Run the 'AZ Availability - Power Interruption' scenario",
"description": "n/a",
"method": [
{
"name": "start-availability-zone-power-interruption-scenario",
"type": "action",
"provider": {
"func": "start_availability_zone_power_interruption_scenario",
"type": "python",
"module": "chaosaws.fis.actions",
"arguments": {
"az": "eu-central-1a",
"tags": "chaosengineering=true",
"duration": "PT30M"
}
}
}
],
"rollbacks": [
{
"name": "restore-availability-zone-power-after-interruption",
"type": "action",
"provider": {
"func": "restore_availability_zone_power_after_interruption",
"type": "python",
"module": "chaosaws.fis.actions"
}
}
]
}
```

## [0.30.0][] - 2024-01-20

Expand Down
2 changes: 1 addition & 1 deletion chaosaws/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from chaoslib.types import Configuration, DiscoveredActivities, Discovery, Secrets
from logzero import logger

__version__ = "0.30.0"
__version__ = "0.31.0"
__all__ = ["__version__", "discover", "aws_client", "signed_api_call"]


Expand Down
120 changes: 116 additions & 4 deletions chaosaws/fis/actions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import threading
import time
from secrets import token_hex
from typing import Dict, List, Optional, Union

from chaoslib.exceptions import FailedActivity
Expand All @@ -16,6 +16,7 @@
"stop_experiments_by_tags",
"start_stress_pod_delete_scenario",
"start_availability_zone_power_interruption_scenario",
"restore_availability_zone_power_after_interruption",
]


Expand Down Expand Up @@ -158,17 +159,35 @@ def stop_experiments_by_tags(
except Exception as ex:
raise FailedActivity(f"Listing Experiments failed, reason was: {ex}")

logger.debug(f"Trying to stop experiments which are supersets of {tags}")
stopped = []
template_ids = []
for x in experiments["experiments"]:
try:
if x["tags"] == tags:
if tags.items() <= x["tags"].items():
status = x["state"]["status"]
if status in ("pending", "initiating", "running", "completed"):
result = fis_client.stop_experiment(id=x["id"])
stopped.append(result)
template_ids.append(x["experimentTemplateId"])
except Exception as ex:
raise FailedActivity(f"Stop Experiment failed, reason was: {ex}")

logger.debug(f"Stopped experiments {stopped}")

if delete_templates:
logger.debug(f"Deleting experiments templates {template_ids}")

for template_id in template_ids:
try:
fis_client.delete_experiment_template(id=template_id)
logger.debug(f"Experiment template {template_id} deleted")
except Exception as ex:
raise FailedActivity(
f"Delete Experiment template {template_id} failed, "
f"reason was: {ex}"
)

return stopped


Expand Down Expand Up @@ -315,7 +334,7 @@ def start_availability_zone_power_interruption_scenario(
)
account_id = sts_client.get_caller_identity().get("Account")

suffix = token_hex(4)
suffix = f"{threading.get_ident()}"

tags = convert_tags(tags)

Expand Down Expand Up @@ -481,7 +500,7 @@ def start_availability_zone_power_interruption_scenario(
resource_name="iam", configuration=configuration, secrets=secrets
)

role_name = f"ChaosToolkit-AWSFISIAMRole-{suffix}"
role_name = f"ChaosToolkit-FIS-{suffix}"

assume_role_policy = json.dumps(
{
Expand Down Expand Up @@ -785,3 +804,96 @@ def start_availability_zone_power_interruption_scenario(
return fis_client.start_experiment(**params)
except Exception as ex:
raise FailedActivity(f"Start Experiment failed, reason was: {ex}")


def restore_availability_zone_power_after_interruption(
    tags: Union[str, Dict[str, str], None] = None,
    delete_roles_and_policies: bool = True,
    delete_templates: bool = True,
    configuration: Configuration = None,
    secrets: Secrets = None,
) -> List[AWSResponse]:
    """
    Restore Availability-Zone and clean any resources created for the experiment

    The experiments matching `tags` are stopped first (and their templates
    deleted when requested). Then, when `delete_roles_and_policies` is set,
    the `ChaosToolkit-FIS-<suffix>` IAM role is cleaned up: its attached
    policies are detached, the role is deleted and finally the detached
    customer managed policies are deleted. The IAM cleanup is best-effort:
    missing entities are logged at debug level and do not fail the action.

    The suffix is the current thread identifier, so the default tag and role
    name only match resources created from the same thread.

    :param tags: str | Dict[str, str] | None representing tags to lookup
        experiments. When left empty, using a special tag key that was set
        on all resources during the start of the experiment
    :param delete_roles_and_policies: boolean, true means any created resources
        such as roles and policies will be deleted too
    :param delete_templates: boolean delete the template for the experiment
    :param configuration: Configuration object representing the CTK Configuration
    :param secrets: Secret object representing the CTK Secrets
    :returns: the list of responses returned by `stop_experiments_by_tags`
        for the experiments that were stopped
    """
    suffix = f"{threading.get_ident()}"

    if not tags:
        tags = {"chaostoolkit-experiment-key": suffix}

    logger.debug("Deleting experiment and restoring AZ")

    payload = stop_experiments_by_tags(
        tags=tags,
        delete_templates=delete_templates,
        configuration=configuration,
        secrets=secrets,
    )

    if not delete_roles_and_policies:
        return payload

    iam_client = aws_client(
        resource_name="iam", configuration=configuration, secrets=secrets
    )
    no_such_entity = iam_client.exceptions.NoSuchEntityException

    role_name = f"ChaosToolkit-FIS-{suffix}"
    logger.info(f"Deleting role {role_name}")

    try:
        response = iam_client.list_attached_role_policies(RoleName=role_name)
    except no_such_entity:
        # The role does not exist, so there is nothing to detach or delete.
        logger.debug("Failed to list attached role policies")
        return payload

    logger.debug(f"Detaching policies {response}")

    policies = list(response["AttachedPolicies"])

    # A role cannot be deleted while managed policies are still attached.
    for policy in policies:
        logger.debug(f"Detaching policy {policy['PolicyName']}")
        try:
            iam_client.detach_role_policy(
                RoleName=role_name, PolicyArn=policy["PolicyArn"]
            )
        except no_such_entity:
            logger.debug(f"Failed to detach policy {policy['PolicyArn']}")

    try:
        iam_client.delete_role(RoleName=role_name)
    except no_such_entity:
        logger.debug(f"Failed to delete role {role_name}")
        return payload

    for policy in policies:
        policy_name = policy["PolicyName"]
        policy_arn = policy["PolicyArn"]

        # don't delete managed policies: AWS-owned ones live under the
        # reserved "aws" account in their ARN and cannot be deleted anyway
        if (
            policy_name in ["AWSFaultInjectionSimulatorRDSAccess"]
            or ":iam::aws:policy/" in policy_arn
        ):
            continue

        logger.debug(f"Deleting policy {policy_name}")
        try:
            # Attached policies are managed policies, so they are removed by
            # ARN with delete_policy. delete_role_policy only handles inline
            # policies and needs the role, which is already gone at this point.
            iam_client.delete_policy(PolicyArn=policy_arn)
        except (
            no_such_entity,
            iam_client.exceptions.DeleteConflictException,
        ):
            # DeleteConflict: the policy is still attached elsewhere or has
            # extra versions; leave it in place rather than fail the rollback.
            logger.debug(f"Failed to delete policy {policy_name}")

    return payload

0 comments on commit c6f90fa

Please sign in to comment.