Resiliency scenarios #1

Open · wants to merge 1 commit into base: master

1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -94,6 +94,7 @@
PROVIDER_CLIENT_DEPLOYMENT_DIR = os.path.join(
TEMPLATE_DIR, "provider-client-deployment"
)
RESILIENCY_DIR = os.path.join(TOP_DIR, "ocs_ci", "resiliency")

# OCP Deployment constants
CHRONY_TEMPLATE = os.path.join(
2 changes: 1 addition & 1 deletion ocs_ci/ocs/node.py
@@ -333,7 +333,7 @@ def get_node_ips(node_type="worker"):
ocp = OCP(kind=constants.NODE)
if node_type == "worker":
nodes = ocp.get(selector=constants.WORKER_LABEL).get("items")
-        if node_type == "master:":
+        if node_type == "master":
nodes = ocp.get(selector=constants.MASTER_LABEL).get("items")

if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
182 changes: 182 additions & 0 deletions ocs_ci/resiliency/cluster_failures.py
@@ -0,0 +1,182 @@
import logging
from ocs_ci.ocs.node import get_node_ips
from abc import ABC, abstractmethod
from ocs_ci.framework import config
from ocs_ci.utility.vsphere import VSPHERE
from ocs_ci.ocs import constants
import random
import time

log = logging.getLogger(__name__)


class ClusterFailures(ABC):
def __init__(self, cluster_name):
self.cluster_name = cluster_name

def random_node_ip(self, node_type="worker"):
"""Return a random node IP of a given node type."""
ips = get_node_ips(node_type=node_type)
return random.choice(ips)

    @abstractmethod
    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node; pick a random node of the given type when node_ip is None."""

    @abstractmethod
    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Connect or disconnect a node's network interface."""

    @abstractmethod
    def network_split(self, nodes):
        """Partition the network between the given groups of nodes."""


class VsphereClusterFailures(ClusterFailures):
def __init__(self):
super().__init__(cluster_name="vSphere")
self.vsphere_host = config.ENV_DATA["vsphere_server"]
self.vsphere_password = config.ENV_DATA["vsphere_password"]
self.vsphere_username = config.ENV_DATA["vsphere_user"]
self.dc = config.ENV_DATA["vsphere_datacenter"]
self.vsobj = VSPHERE(
self.vsphere_host, self.vsphere_username, self.vsphere_password
)

def shutdown_node(self, node_ip=None, node_type="worker"):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(f"Shutting down node {node_ip} on vSphere cluster {self.cluster_name}")
vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
self.vsobj.stop_vms([vm])
log.info(f"Node {node_ip} VM instance stopped.")

def reboot_node(self, node_ip=None, node_type="worker"):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
vm_name = vm.name
self.vsobj.stop_vms([vm])
log.info(f"VM instance {vm_name} is stopped.")
time.sleep(20)
self.vsobj.start_vms([vm])
log.info(f"VM instance {vm_name} is started.")

def change_node_network_interface_state(
self, node_ip=None, node_type="worker", interface_name=None, connect=False
):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"{'Connecting' if connect else 'Disconnecting'} network interface"
f"of node {node_ip} on vSphere cluster {self.cluster_name}"
)
self.vsobj.change_vm_network_state(node_ip, self.dc, connect=connect)

def network_split(self, nodes):
log.warning("Function 'network_split' is not implemented.")
raise NotImplementedError("Function 'network_split' is not implemented.")


class IbmCloudClusterFailures(ClusterFailures):
def __init__(self):
super().__init__(cluster_name="IBM Cloud")

def shutdown_node(self, node_ip=None, node_type="worker"):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"Shutting down node {node_ip} on IBM Cloud cluster {self.cluster_name}"
)
raise NotImplementedError("IBM Cloud shutdown logic is not implemented.")

def change_node_network_interface_state(
self, node_ip=None, node_type="worker", interface_name=None, connect=False
):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"{'Connecting' if connect else 'Disconnecting'} network interface"
f" of node {node_ip} on IBM Cloud cluster {self.cluster_name}"
)
# Add IBM Cloud-specific logic here

def network_split(self, nodes):
log.info(
f"Simulating network split on nodes {nodes} in IBM Cloud cluster {self.cluster_name}"
)
# Add IBM Cloud-specific network split logic


class AwsClusterFailures(ClusterFailures):
def __init__(self):
super().__init__(cluster_name="AWS")

def shutdown_node(self, node_ip=None, node_type="worker"):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(f"Shutting down node {node_ip} on AWS cluster {self.cluster_name}")
# Add AWS-specific shutdown logic

def change_node_network_interface_state(
self, node_ip=None, node_type="worker", interface_name=None, connect=False
):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"{'Connecting' if connect else 'Disconnecting'} network interface"
f"of node {node_ip} on AWS cluster {self.cluster_name}"
)
# Add AWS-specific logic here

def network_split(self, nodes):
log.info(
f"Simulating network split on nodes {nodes} in AWS cluster {self.cluster_name}"
)
# Add AWS-specific network split logic


class BaremetalClusterFailures(ClusterFailures):
def __init__(self):
super().__init__(cluster_name="Bare Metal")

def shutdown_node(self, node_ip=None, node_type="worker"):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"Shutting down node {node_ip} on Bare Metal cluster {self.cluster_name}"
)
# Add bare metal-specific shutdown logic

def change_node_network_interface_state(
self, node_ip=None, node_type="worker", interface_name=None, connect=False
):
if not node_ip:
node_ip = self.random_node_ip(node_type=node_type)
log.info(
f"{'Connecting' if connect else 'Disconnecting'} network interface"
f" of node {node_ip} on Bare Metal cluster {self.cluster_name}"
)
# Add bare metal-specific logic here

def network_split(self, nodes):
log.info(
f"Simulating network split on nodes {nodes} in Bare Metal cluster {self.cluster_name}"
)
# Add bare metal-specific network split logic


def get_cluster_object():
platform = config.ENV_DATA["platform"].lower()
if platform == constants.VSPHERE_PLATFORM:
return VsphereClusterFailures()
elif platform == constants.AWS_PLATFORM:
return AwsClusterFailures()
elif platform == constants.IBMCLOUD_PLATFORM:
return IbmCloudClusterFailures()
elif platform == constants.BAREMETAL_PLATFORM:
return BaremetalClusterFailures()
else:
raise ValueError(f"Unsupported platform: {platform}")
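
The factory keys off config.ENV_DATA["platform"], so callers never hard-code a platform class. A minimal usage sketch, assuming an ocs-ci run whose ENV_DATA is already populated for the target platform:

from ocs_ci.resiliency.cluster_failures import get_cluster_object

# Resolve the platform-specific failure injector (vSphere, AWS, IBM Cloud, or bare metal).
cluster = get_cluster_object()

# Power off a random worker node; on vSphere this stops the backing VM.
cluster.shutdown_node(node_type="worker")

# Disconnect, then reconnect, the network interface of a random master node.
ip = cluster.random_node_ip(node_type="master")
cluster.change_node_network_interface_state(node_ip=ip, node_type="master", connect=False)
cluster.change_node_network_interface_state(node_ip=ip, node_type="master", connect=True)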
20 changes: 20 additions & 0 deletions ocs_ci/resiliency/conf/network_failures.yaml
@@ -0,0 +1,20 @@
NETWORK_FAILURES:
WAIT_TILL_NODE_JOIN: true
FAILURES:
- NODE_NETWORK_DOWN:
NETWORK_FAILURE_DURATION: 30
node_selector:
- labels: []
num_reboot_nodes: "0-3"
- POD_NETWORK_FAILURE:
node_selector:
- labels: []
WORKLOAD:
FIO:
- CEPHFS:
template: "fio_cephfs_template.yaml"
name: "cephfs-fio-workload"
- BLOCK:
template: "fio_block_template.yaml"
name: "block-fio-workload"

22 changes: 22 additions & 0 deletions ocs_ci/resiliency/conf/node_failures.yaml
@@ -0,0 +1,22 @@
NODE_FAILURES:
WAIT_TILL_NODE_JOIN: true
FAILURES:
- POWEROFF_NODE:
NODE_TYPE:
- "master"
- "worker"
node_selector:
- labels: []
iteration: 3
- NODE_DRAIN:
PRIORITY: 1
node_selector:
- labels: []
WORKLOAD:
FIO:
- CEPHFS:
template: "fio_cephfs_template.yaml"
name: "cephfs-fio-workload"
- BLOCK:
template: "fio_block_template.yaml"
name: "block-fio-workload"
7 changes: 7 additions & 0 deletions ocs_ci/resiliency/conf/resiliency.yaml
@@ -0,0 +1,7 @@
RESILIENCY:
RUN_CONFIG:
STOP_WHEN_CEPH_UNHEALTHY: true
    ITERATE_SCENARIOS: true
FAILURE_SCENARIOS:
- NODE_FAILURES
- NETWORK_FAILURES
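
resiliency.yaml is the top-level run control: which scenario configs to execute and whether to stop when Ceph goes unhealthy. A sketch of a driver loop honoring it; the orchestration itself is not part of this PR, and the scenario_confs mapping is a hypothetical stand-in for the per-scenario YAML data:

import yaml

from ocs_ci.utility.utils import ceph_health_check
from ocs_ci.resiliency.node_failures import NodeFailures
from ocs_ci.resiliency.network_failures import NetworkFailures

SCENARIO_CLASSES = {
    "NODE_FAILURES": NodeFailures,
    "NETWORK_FAILURES": NetworkFailures,
}

def run_resiliency(conf_path, scenario_confs):
    """Hypothetical driver: scenario_confs maps scenario name -> failure_data dict."""
    with open(conf_path) as f:
        conf = yaml.safe_load(f)["RESILIENCY"]
    for name in conf["FAILURE_SCENARIOS"]:
        SCENARIO_CLASSES[name](scenario_confs[name]).run()
        if conf["RUN_CONFIG"]["STOP_WHEN_CEPH_UNHEALTHY"]:
            ceph_health_check()  # abort early if the cluster did not recover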
59 changes: 59 additions & 0 deletions ocs_ci/resiliency/network_failures.py
@@ -0,0 +1,59 @@
import logging
import time

from ocs_ci.resiliency.cluster_failures import get_cluster_object

log = logging.getLogger(__name__)


class NetworkFailures:
SCENARIO_NAME = "NETWORK_FAILURES"
FAILURE_METHODS = {
"POD_NETWORK_FAILURE": "_run_pod_network_failures",
"NODE_NETWORK_DOWN": "_run_node_network_failure",
}

def __init__(self, failure_data):
self.scenario_name = self.SCENARIO_NAME
self.failure_data = failure_data
self.cluster_obj = get_cluster_object()

def failure_case(self):
"""Get the first failure case key from failure_data."""
if not self.failure_data:
raise ValueError("No failure case provided in failure_data.")
return next(iter(self.failure_data))

def run(self):
"""Dynamically call the appropriate method based on the failure case."""
case = self.failure_case()
method_name = self.FAILURE_METHODS.get(case)
if method_name and hasattr(self, method_name):
method = getattr(self, method_name)
method()
else:
raise NotImplementedError(
f"Failure method for case '{case}' is not implemented."
)

def _run_pod_network_failures(self):
"""Handle Pod Network Failure scenario."""
log.info("Bringing down Pod network interface.")
# Implement pod network failure logic here

def _run_node_network_failure(self):
"""Handle Node Network Failure scenario."""
log.info("Bringing down Node network interfaces.")
node_types = ["master", "worker"]
for node_type in node_types:
node_ip = self.cluster_obj.random_node_ip(node_type)
self.cluster_obj.change_node_network_interface_state(
node_ip=node_ip, node_type=node_type, connect=False
)
try:
time.sleep(60) # Simulate network being down
finally:
self.cluster_obj.change_node_network_interface_state(
node_ip=node_ip, node_type=node_type, connect=True
)
log.info(f"Network interface on node {node_ip} restored.")
65 changes: 65 additions & 0 deletions ocs_ci/resiliency/node_failures.py
@@ -0,0 +1,65 @@
import logging
from ocs_ci.utility.utils import ceph_health_check
from ocs_ci.resiliency.cluster_failures import get_cluster_object

log = logging.getLogger(__name__)


class NodeFailures:
SCENARIO_NAME = "NODE_FAILURES"
FAILURE_METHODS = {
"POWEROFF_NODE": "_run_poweroff_node",
"NODE_DRAIN": "_run_node_drain",
}

def __init__(self, failure_data):
self.failure_data = failure_data
self.failure_case_name = self._get_failure_case()
self.scenario_name = self.SCENARIO_NAME
self.cluster_obj = get_cluster_object()

def _get_failure_case(self):
"""Retrieve the failure case name from the provided failure data."""
if not self.failure_data:
log.error("Failure data is empty.")
return None
return next(iter(self.failure_data))

def run(self):
"""Run the failure scenario based on the failure case."""
if not self.failure_case_name:
log.error("No valid failure case name found. Exiting run method.")
return

method_name = self.FAILURE_METHODS.get(self.failure_case_name)
if method_name and hasattr(self, method_name):
failure_method = getattr(self, method_name)
failure_method()
self._post_scenario_checks()
else:
raise NotImplementedError(
f"Failure method for '{self.failure_case_name}' is not implemented."
)

def _run_poweroff_node(self):
"""Simulate the reboot of nodes."""
log.info("Running Failure Case: POWEROFF_NODE.")
node_types = self.failure_data[self.failure_case_name].get("NODE_TYPE", [])
num_nodes = self.failure_data[self.failure_case_name].get("NUM_NODES", 2)
for node_type in node_types:
for _ in range(num_nodes):
log.info(f"Rebooting {node_type} node.")
self.cluster_obj.reboot_node(node_type=node_type)
log.info(f"{node_type.capitalize()} node rebooted.")

def _run_node_drain(self):
"""Simulate draining of nodes."""
log.info("Running Failure Case: NODE_DRAIN.")
# Implement node drain logic here
log.info("Draining node...")

def _post_scenario_checks(self):
"""Perform post-scenario checks to ensure the cluster is healthy."""
log.info(f"Running post-scenario checks for {self.scenario_name}.")
log.info("Verifying that Ceph health is OK (retrying if necessary).")
ceph_health_check(tries=45, delay=60)
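
A usage sketch, with failure_data shaped like one NODE_FAILURES.FAILURES entry. Two caveats worth noting: _run_poweroff_node reads NUM_NODES while the sample YAML provides iteration, and reboot_node is currently only implemented on VsphereClusterFailures, so this path assumes a vSphere run:

from ocs_ci.resiliency.node_failures import NodeFailures

failure_data = {
    "POWEROFF_NODE": {
        "NODE_TYPE": ["worker"],
        "NUM_NODES": 1,  # the code's key; the sample YAML uses "iteration" instead
    }
}

scenario = NodeFailures(failure_data)
scenario.run()  # reboots one worker node, then runs the Ceph health post-checks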