From a733ae3acfb46b25677392c8735426300aca0d6d Mon Sep 17 00:00:00 2001
From: Parag Kamble
Date: Sat, 24 Aug 2024 22:50:11 +0530
Subject: [PATCH] Automate resiliency test scenarios

Add a resiliency test framework that injects failure scenarios while
FIO workloads run on CephFS and RBD PVCs. Scenarios and their failure
cases are driven by YAML files under ocs_ci/resiliency/conf. Platform
specific failure injection is implemented for vSphere; IBM Cloud, AWS
and bare metal are stubbed out for now. Also fixes a typo in
get_node_ips() ("master:" -> "master").

Signed-off-by: Parag Kamble
---
 ocs_ci/ocs/constants.py                       |   1 +
 ocs_ci/ocs/node.py                            |   2 +-
 ocs_ci/resiliency/cluster_failures.py         | 182 ++++++++++++++++++
 ocs_ci/resiliency/conf/network_failures.yaml  |  20 ++
 ocs_ci/resiliency/conf/node_failures.yaml     |  22 +++
 ocs_ci/resiliency/conf/resiliency.yaml        |   7 +
 ocs_ci/resiliency/network_failures.py         |  59 ++++++
 ocs_ci/resiliency/node_failures.py            |  65 +++++++
 ocs_ci/resiliency/resiliency_helper.py        | 169 ++++++++++++++++
 ocs_ci/resiliency/resiliency_workload.py      | 154 +++++++++++++++
 .../fio_block_workload_template.yaml          |  40 ++++
 .../workloads/fio_fs_workload_template.yaml   |  40 ++++
 ocs_ci/utility/vsphere.py                     |  34 ++++
 setup.py                                      |   1 +
 tests/conftest.py                             |  21 +++
 .../test_network_failures_scenarios.py        |  38 ++++
 .../resiliency/test_node_failure_scenarios.py |  39 ++++
 17 files changed, 893 insertions(+), 1 deletion(-)
 create mode 100644 ocs_ci/resiliency/cluster_failures.py
 create mode 100644 ocs_ci/resiliency/conf/network_failures.yaml
 create mode 100644 ocs_ci/resiliency/conf/node_failures.yaml
 create mode 100644 ocs_ci/resiliency/conf/resiliency.yaml
 create mode 100644 ocs_ci/resiliency/network_failures.py
 create mode 100644 ocs_ci/resiliency/node_failures.py
 create mode 100644 ocs_ci/resiliency/resiliency_helper.py
 create mode 100644 ocs_ci/resiliency/resiliency_workload.py
 create mode 100644 ocs_ci/resiliency/workloads/fio_block_workload_template.yaml
 create mode 100644 ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml
 create mode 100644 tests/resiliency/test_network_failures_scenarios.py
 create mode 100644 tests/resiliency/test_node_failure_scenarios.py

diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index b9734ff7d2d..310285022a8 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -94,6 +94,7 @@
 PROVIDER_CLIENT_DEPLOYMENT_DIR = os.path.join(
     TEMPLATE_DIR, "provider-client-deployment"
 )
+RESILIENCY_DIR = os.path.join(TOP_DIR, "ocs_ci", "resiliency")
 
 # OCP Deployment constants
 CHRONY_TEMPLATE = os.path.join(
diff --git a/ocs_ci/ocs/node.py b/ocs_ci/ocs/node.py
index 5514b469239..5b74db99f82 100644
--- a/ocs_ci/ocs/node.py
+++ b/ocs_ci/ocs/node.py
@@ -333,7 +333,7 @@ def get_node_ips(node_type="worker"):
     ocp = OCP(kind=constants.NODE)
     if node_type == "worker":
         nodes = ocp.get(selector=constants.WORKER_LABEL).get("items")
-    if node_type == "master:":
+    if node_type == "master":
         nodes = ocp.get(selector=constants.MASTER_LABEL).get("items")
 
     if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
diff --git a/ocs_ci/resiliency/cluster_failures.py b/ocs_ci/resiliency/cluster_failures.py
new file mode 100644
index 00000000000..a1feb189c17
--- /dev/null
+++ b/ocs_ci/resiliency/cluster_failures.py
@@ -0,0 +1,182 @@
+import logging
+from ocs_ci.ocs.node import get_node_ips
+from abc import ABC, abstractmethod
+from ocs_ci.framework import config
+from ocs_ci.utility.vsphere import VSPHERE
+from ocs_ci.ocs import constants
+import random
+import time
+
+log = logging.getLogger(__name__)
+
+
+class ClusterFailures(ABC):
+    def __init__(self, cluster_name):
+        self.cluster_name = cluster_name
+
+    def random_node_ip(self, node_type="worker"):
+        """Return a random node IP of a given node type."""
+        ips = get_node_ips(node_type=node_type)
+        return random.choice(ips)
+
+    @abstractmethod
+    def shutdown_node(self, node_ip=None, node_type="worker"):
+        pass
+
+    @abstractmethod
+    def change_node_network_interface_state(
+        self, node_ip=None, node_type="worker", interface_name=None, connect=False
+    ):
+        pass
+
+    @abstractmethod
+    def network_split(self, nodes):
+        pass
+
+
+class VsphereClusterFailures(ClusterFailures):
+    def __init__(self):
+        super().__init__(cluster_name="vSphere")
+        self.vsphere_host = config.ENV_DATA["vsphere_server"]
+        self.vsphere_password = config.ENV_DATA["vsphere_password"]
+        self.vsphere_username = config.ENV_DATA["vsphere_user"]
+        self.dc = config.ENV_DATA["vsphere_datacenter"]
+        self.vsobj = VSPHERE(
+            self.vsphere_host, self.vsphere_username, self.vsphere_password
+        )
+
+    def shutdown_node(self, node_ip=None, node_type="worker"):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(f"Shutting down node {node_ip} on vSphere cluster {self.cluster_name}")
+        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
+        self.vsobj.stop_vms([vm])
+        log.info(f"Node {node_ip} VM instance stopped.")
+
+    def reboot_node(self, node_ip=None, node_type="worker"):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
+        vm_name = vm.name
+        self.vsobj.stop_vms([vm])
+        log.info(f"VM instance {vm_name} is stopped.")
+        time.sleep(20)
+        self.vsobj.start_vms([vm])
+        log.info(f"VM instance {vm_name} is started.")
+
+    def change_node_network_interface_state(
+        self, node_ip=None, node_type="worker", interface_name=None, connect=False
+    ):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"{'Connecting' if connect else 'Disconnecting'} network interface"
+            f" of node {node_ip} on vSphere cluster {self.cluster_name}"
+        )
+        self.vsobj.change_vm_network_state(node_ip, self.dc, connect=connect)
+
+    def network_split(self, nodes):
+        log.warning("Function 'network_split' is not implemented.")
+        raise NotImplementedError("Function 'network_split' is not implemented.")
+
+
+class IbmCloudClusterFailures(ClusterFailures):
+    def __init__(self):
+        super().__init__(cluster_name="IBM Cloud")
+
+    def shutdown_node(self, node_ip=None, node_type="worker"):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"Shutting down node {node_ip} on IBM Cloud cluster {self.cluster_name}"
+        )
+        raise NotImplementedError("IBM Cloud shutdown logic is not implemented.")
+
+    def change_node_network_interface_state(
+        self, node_ip=None, node_type="worker", interface_name=None, connect=False
+    ):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"{'Connecting' if connect else 'Disconnecting'} network interface"
+            f" of node {node_ip} on IBM Cloud cluster {self.cluster_name}"
+        )
+        # Add IBM Cloud-specific logic here
+
+    def network_split(self, nodes):
+        log.info(
+            f"Simulating network split on nodes {nodes} in IBM Cloud cluster {self.cluster_name}"
+        )
+        # Add IBM Cloud-specific network split logic
+
+
+class AwsClusterFailures(ClusterFailures):
+    def __init__(self):
+        super().__init__(cluster_name="AWS")
+
+    def shutdown_node(self, node_ip=None, node_type="worker"):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(f"Shutting down node {node_ip} on AWS cluster {self.cluster_name}")
+        # Add AWS-specific shutdown logic
+
+    def change_node_network_interface_state(
+        self, node_ip=None, node_type="worker", interface_name=None, connect=False
+    ):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"{'Connecting' if connect else 'Disconnecting'} network interface"
+            f" of node {node_ip} on AWS cluster {self.cluster_name}"
+        )
+        # Add AWS-specific logic here
+
+    def network_split(self, nodes):
+        log.info(
+            f"Simulating network split on nodes {nodes} in AWS cluster {self.cluster_name}"
+        )
+        # Add AWS-specific network split logic
+
+
+class BaremetalClusterFailures(ClusterFailures):
+    def __init__(self):
+        super().__init__(cluster_name="Bare Metal")
+
+    def shutdown_node(self, node_ip=None, node_type="worker"):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"Shutting down node {node_ip} on Bare Metal cluster {self.cluster_name}"
+        )
+        # Add bare metal-specific shutdown logic
+
+    def change_node_network_interface_state(
+        self, node_ip=None, node_type="worker", interface_name=None, connect=False
+    ):
+        if not node_ip:
+            node_ip = self.random_node_ip(node_type=node_type)
+        log.info(
+            f"{'Connecting' if connect else 'Disconnecting'} network interface"
+            f" of node {node_ip} on Bare Metal cluster {self.cluster_name}"
+        )
+        # Add bare metal-specific logic here
+
+    def network_split(self, nodes):
+        log.info(
+            f"Simulating network split on nodes {nodes} in Bare Metal cluster {self.cluster_name}"
+        )
+        # Add bare metal-specific network split logic
+
+
+def get_cluster_object():
+    platform = config.ENV_DATA["platform"].lower()
+    if platform == constants.VSPHERE_PLATFORM:
+        return VsphereClusterFailures()
+    elif platform == constants.AWS_PLATFORM:
+        return AwsClusterFailures()
+    elif platform == constants.IBMCLOUD_PLATFORM:
+        return IbmCloudClusterFailures()
+    elif platform == constants.BAREMETAL_PLATFORM:
+        return BaremetalClusterFailures()
+    else:
+        raise ValueError(f"Unsupported platform: {platform}")
diff --git a/ocs_ci/resiliency/conf/network_failures.yaml b/ocs_ci/resiliency/conf/network_failures.yaml
new file mode 100644
index 00000000000..ad252fbc3a7
--- /dev/null
+++ b/ocs_ci/resiliency/conf/network_failures.yaml
@@ -0,0 +1,20 @@
+NETWORK_FAILURES:
+  WAIT_TILL_NODE_JOIN: true
+  FAILURES:
+    - NODE_NETWORK_DOWN:
+        NETWORK_FAILURE_DURATION: 30
+        node_selector:
+          - labels: []
+        num_reboot_nodes: "0-3"
+    - POD_NETWORK_FAILURE:
+        node_selector:
+          - labels: []
+  WORKLOAD:
+    FIO:
+      - CEPHFS:
+          template: "fio_fs_workload_template.yaml"
+          name: "cephfs-fio-workload"
+      - BLOCK:
+          template: "fio_block_workload_template.yaml"
+          name: "block-fio-workload"
+
diff --git a/ocs_ci/resiliency/conf/node_failures.yaml b/ocs_ci/resiliency/conf/node_failures.yaml
new file mode 100644
index 00000000000..898f337dd22
--- /dev/null
+++ b/ocs_ci/resiliency/conf/node_failures.yaml
@@ -0,0 +1,22 @@
+NODE_FAILURES:
+  WAIT_TILL_NODE_JOIN: true
+  FAILURES:
+    - POWEROFF_NODE:
+        NODE_TYPE:
+          - "master"
+          - "worker"
+        node_selector:
+          - labels: []
+        NUM_NODES: 3
+    - NODE_DRAIN:
+        PRIORITY: 1
+        node_selector:
+          - labels: []
+  WORKLOAD:
+    FIO:
+      - CEPHFS:
+          template: "fio_fs_workload_template.yaml"
+          name: "cephfs-fio-workload"
+      - BLOCK:
+          template: "fio_block_workload_template.yaml"
+          name: "block-fio-workload"
\ No newline at end of file
diff --git a/ocs_ci/resiliency/conf/resiliency.yaml b/ocs_ci/resiliency/conf/resiliency.yaml
new file mode 100644
index 00000000000..dbecf322822
--- /dev/null
+++ b/ocs_ci/resiliency/conf/resiliency.yaml
@@ -0,0 +1,7 @@
+RESILIENCY:
+  RUN_CONFIG:
+    STOP_WHEN_CEPH_UNHEALTHY: true
+    ITERATE_SCENARIOS: true
+  FAILURE_SCENARIOS:
+    - NODE_FAILURES
+    - NETWORK_FAILURES
diff --git a/ocs_ci/resiliency/network_failures.py b/ocs_ci/resiliency/network_failures.py
new file mode 100644
index 00000000000..ef6dd3c9ca3
--- /dev/null
+++ b/ocs_ci/resiliency/network_failures.py
@@ -0,0 +1,59 @@
+import logging
+import time
+
+from ocs_ci.resiliency.cluster_failures import get_cluster_object
+
+log = logging.getLogger(__name__)
+
+
+class NetworkFailures:
+    SCENARIO_NAME = "NETWORK_FAILURES"
+    FAILURE_METHODS = {
+        "POD_NETWORK_FAILURE": "_run_pod_network_failures",
+        "NODE_NETWORK_DOWN": "_run_node_network_failure",
+    }
+
+    def __init__(self, failure_data):
+        self.scenario_name = self.SCENARIO_NAME
+        self.failure_data = failure_data
+        self.cluster_obj = get_cluster_object()
+
+    def failure_case(self):
+        """Get the first failure case key from failure_data."""
+        if not self.failure_data:
+            raise ValueError("No failure case provided in failure_data.")
+        return next(iter(self.failure_data))
+
+    def run(self):
+        """Dynamically call the appropriate method based on the failure case."""
+        case = self.failure_case()
+        method_name = self.FAILURE_METHODS.get(case)
+        if method_name and hasattr(self, method_name):
+            method = getattr(self, method_name)
+            method()
+        else:
+            raise NotImplementedError(
+                f"Failure method for case '{case}' is not implemented."
+            )
+
+    def _run_pod_network_failures(self):
+        """Handle Pod Network Failure scenario."""
+        log.info("Bringing down Pod network interface.")
+        # Implement pod network failure logic here
+
+    def _run_node_network_failure(self):
+        """Handle Node Network Failure scenario."""
+        log.info("Bringing down Node network interfaces.")
+        failure_cfg = self.failure_data[self.failure_case()]
+        for node_type in ("master", "worker"):
+            node_ip = self.cluster_obj.random_node_ip(node_type)
+            self.cluster_obj.change_node_network_interface_state(
+                node_ip=node_ip, node_type=node_type, connect=False
+            )
+            try:
+                time.sleep(failure_cfg.get("NETWORK_FAILURE_DURATION", 60))
+            finally:
+                self.cluster_obj.change_node_network_interface_state(
+                    node_ip=node_ip, node_type=node_type, connect=True
+                )
+                log.info(f"Network interface on node {node_ip} restored.")
diff --git a/ocs_ci/resiliency/node_failures.py b/ocs_ci/resiliency/node_failures.py
new file mode 100644
index 00000000000..3da88d32275
--- /dev/null
+++ b/ocs_ci/resiliency/node_failures.py
@@ -0,0 +1,65 @@
+import logging
+from ocs_ci.utility.utils import ceph_health_check
+from ocs_ci.resiliency.cluster_failures import get_cluster_object
+
+log = logging.getLogger(__name__)
+
+
+class NodeFailures:
+    SCENARIO_NAME = "NODE_FAILURES"
+    FAILURE_METHODS = {
+        "POWEROFF_NODE": "_run_poweroff_node",
+        "NODE_DRAIN": "_run_node_drain",
+    }
+
+    def __init__(self, failure_data):
+        self.failure_data = failure_data
+        self.failure_case_name = self._get_failure_case()
+        self.scenario_name = self.SCENARIO_NAME
+        self.cluster_obj = get_cluster_object()
+
+    def _get_failure_case(self):
+        """Retrieve the failure case name from the provided failure data."""
+        if not self.failure_data:
+            log.error("Failure data is empty.")
+            return None
+        return next(iter(self.failure_data))
+
+    def run(self):
+        """Run the failure scenario based on the failure case."""
+        if not self.failure_case_name:
+            log.error("No valid failure case name found. Exiting run method.")
+            return
+
+        method_name = self.FAILURE_METHODS.get(self.failure_case_name)
+        if method_name and hasattr(self, method_name):
+            failure_method = getattr(self, method_name)
+            failure_method()
+            self._post_scenario_checks()
+        else:
+            raise NotImplementedError(
+                f"Failure method for '{self.failure_case_name}' is not implemented."
+            )
+
+    def _run_poweroff_node(self):
+        """Power cycle (reboot) the configured number of nodes."""
+        log.info("Running Failure Case: POWEROFF_NODE.")
+        node_types = self.failure_data[self.failure_case_name].get("NODE_TYPE", [])
+        num_nodes = self.failure_data[self.failure_case_name].get("NUM_NODES", 2)
+        for node_type in node_types:
+            for _ in range(num_nodes):
+                log.info(f"Rebooting {node_type} node.")
+                self.cluster_obj.reboot_node(node_type=node_type)
+                log.info(f"{node_type.capitalize()} node rebooted.")
+
+    def _run_node_drain(self):
+        """Simulate draining of nodes."""
+        log.info("Running Failure Case: NODE_DRAIN.")
+        # Implement node drain logic here
+        log.info("Draining node...")
+
+    def _post_scenario_checks(self):
+        """Perform post-scenario checks to ensure the cluster is healthy."""
+        log.info(f"Running post-scenario checks for {self.scenario_name}.")
+        log.info("Verifying that Ceph health is OK (retrying if necessary).")
+        ceph_health_check(tries=45, delay=60)
diff --git a/ocs_ci/resiliency/resiliency_helper.py b/ocs_ci/resiliency/resiliency_helper.py
new file mode 100644
index 00000000000..5500fb2c2a8
--- /dev/null
+++ b/ocs_ci/resiliency/resiliency_helper.py
@@ -0,0 +1,169 @@
+import yaml
+import os
+import logging
+from ocs_ci.ocs import constants
+from ocs_ci.resiliency.node_failures import NodeFailures
+from ocs_ci.resiliency.network_failures import NetworkFailures
+from ocs_ci.helpers.sanity_helpers import Sanity
+
+log = logging.getLogger(__name__)
+
+
+class ResiliencyConfig:
+    """Handles loading and parsing of the resiliency configuration."""
+
+    CONFIG_FILE = os.path.join(constants.RESILIENCY_DIR, "conf", "resiliency.yaml")
+
+    def __init__(self):
+        self.data = self.load_yaml(self.CONFIG_FILE)
+        resiliency = self.data.get("RESILIENCY", {})
+        self.run_config = resiliency.get("RUN_CONFIG", {})
+        self.stop_when_ceph_unhealthy = self.run_config.get(
+            "STOP_WHEN_CEPH_UNHEALTHY", False
+        )
+        self.iterate_scenarios = self.run_config.get("ITERATE_SCENARIOS", False)
+        self.failure_scenarios = resiliency.get("FAILURE_SCENARIOS", [])
+
+    @staticmethod
+    def load_yaml(file_path):
+        """Load and parse the YAML file."""
+        try:
+            with open(file_path, "r") as file:
+                return yaml.safe_load(file) or {}
+        except FileNotFoundError:
+            log.error(f"YAML file not found: {file_path}")
+            return {}
+        except yaml.YAMLError as exc:
+            log.error(f"Error parsing YAML file {file_path}: {exc}")
+            return {}
+
+    def __repr__(self):
+        """Representation of the ResiliencyConfig object."""
+        return (
+            f"ResiliencyConfig("
+            f"stop_when_ceph_unhealthy={self.stop_when_ceph_unhealthy}, "
+            f"iterate_scenarios={self.iterate_scenarios}, "
+            f"failure_scenarios={self.failure_scenarios})"
+        )
+
+
+class ResiliencyFailures:
+    """Handles loading failure cases from the configuration and iterating over them."""
+
+    SCENARIO_DIR = os.path.join(constants.RESILIENCY_DIR, "conf")
+
+    def __init__(self, scenario_name, failure_method=None):
+        self.scenario_name = scenario_name
+        self.failure_method = failure_method
+        self.failure_cases_data = self.get_failure_cases_data()
+        self.workload = self.failure_cases_data.get("WORKLOAD", {})
+        self.failure_list = self.get_failure_list()
+        self._iterator = iter(self.failure_list)
+
+    def get_failure_cases_data(self):
+        """Load the YAML file containing failure case details for the given scenario."""
+        log.info(
+            f"Searching for scenario '{self.scenario_name}' in directory: {self.SCENARIO_DIR}"
+        )
+        scenario_file = f"{self.scenario_name.lower()}.yaml"
+        file_path = os.path.join(self.SCENARIO_DIR, scenario_file)
+
+        if os.path.isfile(file_path):
+            data = ResiliencyConfig.load_yaml(file_path)
+            if self.scenario_name in data:
+                log.info(
+                    f"Found scenario '{self.scenario_name}' in file: {scenario_file}"
+                )
+                return data[self.scenario_name]
+            else:
+                log.error(
+                    f"Scenario '{self.scenario_name}' not found in file: {scenario_file}"
+                )
+        else:
+            log.error(
+                f"Scenario file '{scenario_file}' not found in directory: {self.SCENARIO_DIR}"
+            )
+        return {}
+
+    def get_failure_list(self):
+        """Retrieve and optionally filter the failure list based on the failure method."""
+        failures = self.failure_cases_data.get("FAILURES", [])
+        if self.failure_method:
+            # Filter the failures to include only those matching the failure method
+            filtered_failures = [
+                {self.failure_method: failure[self.failure_method]}
+                for failure in failures
+                if self.failure_method in failure
+            ]
+            if not filtered_failures:
+                log.warning(
+                    f"No failures found for failure method '{self.failure_method}' in scenario '{self.scenario_name}'."
+                )
+            return filtered_failures
+        return failures
+
+    def __iter__(self):
+        """Return an iterator over the failure list."""
+        self._iterator = iter(self.failure_list)
+        return self._iterator
+
+
+class Resiliency:
+    """Main class for running resiliency tests."""
+
+    def __init__(self, scenario, failure_method=None):
+        self.scenario_name = scenario
+        self.resiliency_failures = ResiliencyFailures(scenario, failure_method)
+        self.sanity_helpers = Sanity()
+
+    def post_scenario_check(self):
+        """Perform post-scenario checks like Ceph health and logs."""
+        log.info("Checking Ceph health...")
+        self.sanity_helpers.health_check(tries=40)
+        log.info("Must-gather log collection is not implemented yet.")
+
+    def start(self):
+        """Iterate over and inject the failures one by one."""
+        for failure_case in self.resiliency_failures:
+            self.inject_failure(failure_case)
+            self.post_scenario_check()
+
+    def inject_failure(self, failure):
+        """Inject the failure into the system."""
+        log.info(f"Running failure case for scenario '{self.scenario_name}': {failure}")
+        failure_obj = InjectFailures(self.scenario_name, failure)
+        failure_obj.run_failure_case()
+
+    def cleanup(self):
+        """Cleanup method after the scenario is completed."""
+        log.info("Cleaning up after the scenario...")
+
+
+class InjectFailures:
+    """Handles the actual injection of failures based on the scenario."""
+
+    SCENARIO_CLASSES = {
+        NetworkFailures.SCENARIO_NAME: NetworkFailures,
+        NodeFailures.SCENARIO_NAME: NodeFailures,
+    }
+
+    def __init__(self, scenario, failure_case):
+        self.scenario = scenario
+        self.failure_case = failure_case
+
+    def failure_object(self):
+        scenario_class = self.SCENARIO_CLASSES.get(self.scenario)
+        if scenario_class:
+            return scenario_class(self.failure_case)
+        else:
+            raise NotImplementedError(
+                f"No implementation for scenario '{self.scenario}'"
+            )
+
+    def run_failure_case(self):
+        """Inject the failure into the cluster."""
+        log.info(
+            f"Injecting failure into the cluster for scenario '{self.scenario}'..."
+        )
+        failure_obj = self.failure_object()
+        failure_obj.run()
diff --git a/ocs_ci/resiliency/resiliency_workload.py b/ocs_ci/resiliency/resiliency_workload.py
new file mode 100644
index 00000000000..0b5a3b59c9f
--- /dev/null
+++ b/ocs_ci/resiliency/resiliency_workload.py
@@ -0,0 +1,154 @@
+import logging
+import os
+
+from abc import ABC, abstractmethod
+from jinja2 import Environment, FileSystemLoader
+import fauxfactory
+
+from ocs_ci.ocs import constants
+from ocs_ci.utility.utils import run_cmd
+
+log = logging.getLogger(__name__)
+
+
+class Workload(ABC):
+    """
+    Abstract Base Class for all workloads.
+    """
+
+    def __init__(self, namespace="default", image=None):
+        self.namespace = namespace
+        self.image = image
+        self.template_dir = os.path.join(constants.RESILIENCY_DIR, "workloads")
+        self.workload_env = Environment(loader=FileSystemLoader(self.template_dir))
+
+    @abstractmethod
+    def start_workload(self):
+        pass
+
+    @abstractmethod
+    def scale_up_pods(self, desired_count):
+        pass
+
+    @abstractmethod
+    def scale_down_pods(self, desired_count):
+        pass
+
+    @abstractmethod
+    def stop_workload(self):
+        pass
+
+    @abstractmethod
+    def cleanup_workload(self):
+        pass
+
+
+class FioWorkload(Workload):
+    """
+    FIO-specific implementation of Workload.
+    """
+
+    def __init__(self, pvc):
+        super().__init__(namespace=pvc.namespace)
+        self.pvc = pvc
+        self.pvc.reload()
+        self.deployment_name = f"fio-app-{fauxfactory.gen_alpha(8).lower()}"
+        self.volume_mode = self.pvc.data["spec"]["volumeMode"]
+        self.template_file = (
+            "fio_fs_workload_template.yaml"
+            if self.volume_mode == "Filesystem"
+            else "fio_block_workload_template.yaml"
+        )
+        self.template = self.workload_env.get_template(self.template_file)
+        self.output_file = f"/tmp/{fauxfactory.gen_alpha(8).lower()}.yaml"
+        self.render_template()
+
+    def start_workload(self):
+        log.info("Starting FIO workload.")
+        run_cmd(f"oc create -f {self.output_file}")
+        log.info("FIO workload started.")
+
+    def scale_up_pods(self, desired_count):
+        log.info(f"Scaling up FIO pods to {desired_count}.")
+        # Implement logic to scale up pods
+
+    def scale_down_pods(self, desired_count):
+        log.info(f"Scaling down FIO pods to {desired_count}.")
+        # Implement logic to scale down pods
+
+    def stop_workload(self):
+        log.info("Stopping FIO workload.")
+        run_cmd(f"oc delete -f {self.output_file}")
+        log.info("FIO workload stopped.")
+
+    def cleanup_workload(self):
+        log.info("Cleaning up FIO workload.")
+        # Implement cleanup logic, e.g., deleting all pods in the workload
+
+    def render_template(self):
+        rendered_yaml = self.template.render(
+            fio_name=self.deployment_name,
+            namespace=self.namespace,
+            pvc_claim_name=self.pvc.name,
+        )
+        with open(self.output_file, "w") as f:
+            f.write(rendered_yaml)
+        log.info("Rendered FIO workload template.")
+
+
+class SmallFilesWorkload(Workload):
+    """
+    SmallFiles-specific implementation of Workload.
+    """
+
+    def __init__(self, namespace="default", image="smallfiles-image:latest"):
+        super().__init__(namespace, image)
+
+    def start_workload(self):
+        log.info(f"Starting SmallFiles workload in namespace: {self.namespace}.")
+        # Implement pod creation logic
+
+    def scale_up_pods(self, desired_count):
+        log.info(f"Scaling up SmallFiles pods to {desired_count}.")
+        # Implement logic to scale up pods
+
+    def scale_down_pods(self, desired_count):
+        log.info(f"Scaling down SmallFiles pods to {desired_count}.")
+        # Implement logic to scale down pods
+
+    def stop_workload(self):
+        log.info("Stopping SmallFiles workload.")
+        # Implement pod deletion logic
+
+    def cleanup_workload(self):
+        log.info("Cleaning up SmallFiles workload.")
+        # Implement cleanup logic
+
+
+class VdbenchWorkload(Workload):
+    """
+    Vdbench-specific implementation of Workload.
+    """
+
+    def __init__(self, namespace="default", image="vdbench-image:latest"):
+        super().__init__(namespace, image)
+
+    def start_workload(self):
+        log.info(f"Starting Vdbench workload in namespace: {self.namespace}.")
+        # Implement pod creation logic
+
+    def scale_up_pods(self, desired_count):
+        log.info(f"Scaling up Vdbench pods to {desired_count}.")
+        # Implement logic to scale up pods
+
+    def scale_down_pods(self, desired_count):
+        log.info(f"Scaling down Vdbench pods to {desired_count}.")
+        # Implement logic to scale down pods
+
+    def stop_workload(self):
+        log.info("Stopping Vdbench workload.")
+        # Implement pod deletion logic
+
+    def cleanup_workload(self):
+        log.info("Cleaning up Vdbench workload.")
+        # Implement cleanup logic
+ """ + + def __init__(self, namespace="default", image="smallfiles-image:latest"): + super().__init__(namespace, image) + + def start_workload(self): + log.info(f"Starting SmallFiles workload in namespace: {self.namespace}.") + # Implement pod creation logic + + def scale_up_pods(self, desired_count): + log.info(f"Scaling up SmallFiles pods to {desired_count}.") + # Implement logic to scale up pods + + def scale_down_pods(self, desired_count): + log.info(f"Scaling down SmallFiles pods to {desired_count}.") + # Implement logic to scale down pods + + def stop_workload(self): + log.info("Stopping SmallFiles workload.") + # Implement pod deletion logic + + def cleanup_workload(self): + log.info("Cleaning up SmallFiles workload.") + # Implement cleanup logic + + +class VdbenchWorkload(Workload): + """ + Vdbench-specific implementation of Workload. + """ + + def __init__(self, namespace="default", image="vdbench-image:latest"): + super().__init__(namespace, image) + + def start_workload(self): + log.info(f"Starting Vdbench workload in namespace: {self.namespace}.") + # Implement pod creation logic + + def scale_up_pods(self, desired_count): + log.info(f"Scaling up Vdbench pods to {desired_count}.") + # Implement logic to scale up pods + + def scale_down_pods(self, desired_count): + log.info(f"Scaling down Vdbench pods to {desired_count}.") + # Implement logic to scale down pods + + def stop_workload(self): + log.info("Stopping Vdbench workload.") + # Implement pod deletion logic + + def cleanup_workload(self): + log.info("Cleaning up Vdbench workload.") + # Implement cleanup logic diff --git a/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml b/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml new file mode 100644 index 00000000000..a844ff3445e --- /dev/null +++ b/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ fio_name }} # FIO workload name + namespace: {{ namespace }} # OpenShift namespace +spec: + replicas: 1 + selector: + matchLabels: + app: {{ fio_name }} + template: + metadata: + labels: + app: {{ fio_name }} + spec: + replicas: 1 + containers: + - name: fio + image: quay.io/ocsci/nginx:fio + command: + - fio + args: + - "--name={{ fio_name }}" + - "--rw=randwrite" + - "--size=4G" + - "--bs=4k" + - "--numjobs=4" + - "--runtime=0" + - "--group_reporting" + - "--time_based" + - --output=/path/to/your/result_file.txt + - "--filename=/dev/rbdblock" # Block device path + volumeDevices: # Using volumeDevices for Block mode + - name: fio-volume + devicePath: /dev/rbdblock # Path to block device inside the container + restartPolicy: Always + volumes: + - name: fio-volume + persistentVolumeClaim: + claimName: {{ pvc_claim_name }} # PVC claim name for block device + strategy: + type: Recreate \ No newline at end of file diff --git a/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml b/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml new file mode 100644 index 00000000000..9645f7b20f3 --- /dev/null +++ b/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ fio_name }} # FIO workload name + namespace: {{ namespace }} # OpenShift namespace +spec: + replicas: 1 + selector: + matchLabels: + app: {{ fio_name }} + template: + metadata: + labels: + app: {{ fio_name }} + spec: + replicas: 1 + containers: + - name: fio + image: quay.io/ocsci/nginx:fio + command: + - fio + args: + - "--name={{ 
fio_name }}" + - "--rw=randwrite" + - "--size=4G" + - "--bs=4k" + - "--numjobs=4" + - "--runtime=0" + - "--group_reporting" + - "--time_based" + - "--filename=/mnt/fio_file_workload" + volumeMounts: + - name: fio-volume + mountPath: /mnt + restartPolicy: Always + volumes: + - name: fio-volume + persistentVolumeClaim: + claimName: {{ pvc_claim_name }} # PVC claim name + strategy: + type: Recreate \ No newline at end of file diff --git a/ocs_ci/utility/vsphere.py b/ocs_ci/utility/vsphere.py index 857b930ad59..fafade9f729 100644 --- a/ocs_ci/utility/vsphere.py +++ b/ocs_ci/utility/vsphere.py @@ -1727,3 +1727,37 @@ def get_volume_path(self, volume_id, datastore_name, datacenter_name): volume_path = vstorage_object.config.backing.filePath logger.debug(f"File path for volume {volume_id} is `{volume_path}`") return volume_path + + def wait_for_vm_status(self, vm, desired_status, timeout=300, interval=10): + """ + Wait for the VM to reach the desired status. + + :param vm: The virtual machine object + :param desired_status: The desired status (e.g., 'poweredOn', 'poweredOff') + :param timeout: Maximum time (in seconds) to wait for the VM to reach the desired status + :param interval: Time (in seconds) between each status check + :return: True if the VM reaches the desired status, False otherwise + """ + import time + + start_time = time.time() + + while time.time() - start_time < timeout: + current_status = vm.runtime.powerState + + # Check if the VM has reached the desired status + if current_status == desired_status: + logger.info( + f"VM {vm.name} has reached the desired status: {desired_status}" + ) + return True + + logger.info( + f"Current status of VM {vm.name} is {current_status}, waiting for {desired_status}..." + ) + time.sleep(interval) + + logger.info( + f"VM {vm.name} did not reach the desired status {desired_status} within the timeout period." + ) + return False diff --git a/setup.py b/setup.py index 25b34490b26..425e57f4979 100644 --- a/setup.py +++ b/setup.py @@ -100,6 +100,7 @@ "psycopg2-binary==2.9.9", "azure-keyvault-secrets==4.8.0", "pytest-jira==0.3.21", + "dynaconf==3.2.5", ], entry_points={ "console_scripts": [ diff --git a/tests/conftest.py b/tests/conftest.py index d15bbaf2091..b6743f37c48 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8351,6 +8351,27 @@ def finalizer(): return repo_dir + +@pytest.fixture +def fio_resiliency_workload(request): + """ + Pytest fixture to start and stop an FIO workload for resiliency testing. 
+ """ + from ocs_ci.resiliency.resiliency_workload import FioWorkload + + def factory(pvc_obj): + fio = FioWorkload(pvc_obj) + fio.start_workload() + # yield + + def finalizer(): + fio.stop_workload() + + request.addfinalizer(finalizer) + return fio # Return the FioWorkload instance if needed + + return factory + @pytest.fixture(scope="session", autouse=True) def run_description(): """ @@ -8395,3 +8416,4 @@ def finalizer(): cnv_obj.uninstall_cnv() request.addfinalizer(finalizer) + diff --git a/tests/resiliency/test_network_failures_scenarios.py b/tests/resiliency/test_network_failures_scenarios.py new file mode 100644 index 00000000000..5d69aaa0b1c --- /dev/null +++ b/tests/resiliency/test_network_failures_scenarios.py @@ -0,0 +1,39 @@ +from ocs_ci.resiliency.resiliency_helper import Resiliency + +# from ocs_ci.resiliency.resiliency_workload import workload_object +import logging +from ocs_ci.ocs import constants + +log = logging.getLogger(__name__) + + +class TestNetworkFailures: + def test_node_network_failure(self, multi_pvc_factory, fio_resiliency_workload): + """ """ + scenario = "NETWORK_FAILURES" + failure_method = "NODE_NETWORK_DOWN" + + # Create pvcs with different access_modes + size = 5 + access_modes = [constants.ACCESS_MODE_RWO] + cephfs_pvc_objs = multi_pvc_factory( + interface=constants.CEPHFILESYSTEM, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + rbd_pvc_objs = multi_pvc_factory( + interface=constants.CEPHBLOCKPOOL, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + # Starting Workload on the cluster + for pv_obj in cephfs_pvc_objs + rbd_pvc_objs: + fio_resiliency_workload(pv_obj) + + node_failures = Resiliency(scenario, failure_method=failure_method) + node_failures.start() + node_failures.cleanup() diff --git a/tests/resiliency/test_node_failure_scenarios.py b/tests/resiliency/test_node_failure_scenarios.py new file mode 100644 index 00000000000..0195c554237 --- /dev/null +++ b/tests/resiliency/test_node_failure_scenarios.py @@ -0,0 +1,40 @@ +from ocs_ci.resiliency.resiliency_helper import Resiliency + +# from ocs_ci.resiliency.resiliency_workload import workload_object +import logging +from ocs_ci.ocs import constants + +log = logging.getLogger(__name__) + + +class TestResiliencyNodeFailures: + def test_node_poweroff(self, multi_pvc_factory, fio_resiliency_workload): + """Resiliency tests""" + scenario = "NODE_FAILURES" + failure_method = "POWEROFF_NODE" + + # Create pvcs with different access_modes + size = 5 + access_modes = [constants.ACCESS_MODE_RWO] + cephfs_pvc_objs = multi_pvc_factory( + interface=constants.CEPHFILESYSTEM, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + rbd_pvc_objs = multi_pvc_factory( + interface=constants.CEPHBLOCKPOOL, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + # Starting Workload on the cluster + for pv_obj in cephfs_pvc_objs + rbd_pvc_objs: + fio_resiliency_workload(pv_obj) + + # Injecting Resiliency failures + node_failures = Resiliency(scenario, failure_method=failure_method) + node_failures.start() + node_failures.cleanup()