Test CLI tool for disk replacement procedure #9655

Merged: 6 commits, May 13, 2024
2 changes: 1 addition & 1 deletion ocs_ci/helpers/helpers.py
@@ -4586,7 +4586,7 @@ def retrieve_cli_binary(cli_type="mcg"):
if cli_type == "mcg":
local_cli_path = constants.NOOBAA_OPERATOR_LOCAL_CLI_PATH
elif cli_type == "odf":
local_cli_path = constants.CLI_TOOL_LOCAL_PATH
local_cli_path = os.path.join(config.RUN["bin_dir"], "odf-cli")
local_cli_dir = os.path.dirname(local_cli_path)
live_deployment = config.DEPLOYMENT["live_deployment"]
if live_deployment and semantic_version >= version.VERSION_4_13:
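For illustration, a minimal sketch of what the new path resolution does, assuming config.RUN["bin_dir"] holds the framework's binary directory (the dict literal below is a stand-in for the real config object):

import os

run_config = {"bin_dir": "bin"}  # stand-in for config.RUN
local_cli_path = os.path.join(run_config["bin_dir"], "odf-cli")
local_cli_dir = os.path.dirname(local_cli_path)
print(local_cli_path, local_cli_dir)  # -> bin/odf-cli bin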
176 changes: 97 additions & 79 deletions ocs_ci/ocs/osd_operations.py
@@ -19,16 +19,20 @@
delete_osd_removal_job,
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.helpers.helpers import retrieve_cli_binary
from ocs_ci.utility.utils import run_cmd_interactive


logger = logging.getLogger(__name__)


def osd_device_replacement(nodes):
def osd_device_replacement(nodes, cli_tool=False):
"""
Replacing randomly picked osd device
Args:
node (OCS): The OCS object representing the node
nodes (OCS): The OCS object representing the node
cli_tool (bool): If True, replace the disk using the odf-cli tool; otherwise use "oc" commands

"""
logger.info("Picking a PV which to be deleted from the platform side")
osd_pvs = get_deviceset_pvs()
@@ -101,16 +105,105 @@ def osd_device_replacement(nodes):
== claim_name
][0]
osd_deployment_name = osd_deployment.name
osd_pod_name = osd_pod.name

# Delete the volume from the platform side
logger.info(f"Deleting {volume_path} from the platform side")
nodes.detach_volume(volume_path, osd_node)

# Scale down OSD deployment
logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

# Force delete OSD pod if necessary
osd_pod_name = osd_pod.name
logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
try:
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
except TimeoutError:
osd_pod.delete(force=True)
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

# Run ocs-osd-removal job
osd_removal_job = run_osd_removal_job([osd_id])
assert osd_removal_job, "ocs-osd-removal failed to create"
is_completed = verify_osd_removal_job_completed_successfully(osd_id)
assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
logger.info("ocs-osd-removal-job completed successfully")

osd_pvc_name = osd_pvc.name

if ocp_version < version.VERSION_4_6:
# Delete the OSD prepare job
logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
osd_prepare_job.delete()
osd_prepare_job.ocp.wait_for_delete(
resource_name=osd_prepare_job_name, timeout=120
)

# Delete the OSD PVC
logger.info(f"Deleting OSD PVC {osd_pvc_name}")
osd_pvc.delete()
osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

# Delete the OSD deployment
logger.info(f"Deleting OSD deployment {osd_deployment_name}")
osd_deployment.delete()
osd_deployment.ocp.wait_for_delete(
resource_name=osd_deployment_name, timeout=120
if cli_tool:
retrieve_cli_binary(cli_type="odf")
run_cmd_interactive(
cmd=f"odf-cli purge-osd {osd_id}",
prompts_answers={
"yes-force-destroy-osd": "yes-force-destroy-osd",
"completed removal of OSD": "",
},
string_answer=True,
raise_exception=False,
)
else:
# If ocp version is '4.6' and above the osd removal job should
# delete the OSD prepare job, OSD PVC, OSD deployment
# We just need to verify the old PV is in the expected status
logger.info(f"Verify that the old PV '{osd_pv_name}' is in the expected status")
if cluster.is_lso_cluster():
expected_old_pv_statuses = [constants.STATUS_RELEASED]
# Scale down OSD deployment
logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

# Force delete OSD pod if necessary
logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
try:
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
except TimeoutError:
osd_pod.delete(force=True)
osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

# Run ocs-osd-removal job
osd_removal_job = run_osd_removal_job([osd_id])
assert osd_removal_job, "ocs-osd-removal failed to create"
is_completed = verify_osd_removal_job_completed_successfully(osd_id)
assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
logger.info("ocs-osd-removal-job completed successfully")

osd_pvc_name = osd_pvc.name

if ocp_version < version.VERSION_4_6:
# Delete the OSD prepare job
logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
osd_prepare_job.delete()
osd_prepare_job.ocp.wait_for_delete(
resource_name=osd_prepare_job_name, timeout=120
)

# Delete the OSD PVC
logger.info(f"Deleting OSD PVC {osd_pvc_name}")
osd_pvc.delete()
osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

# Delete the OSD deployment
logger.info(f"Deleting OSD deployment {osd_deployment_name}")
osd_deployment.delete()
osd_deployment.ocp.wait_for_delete(
resource_name=osd_deployment_name, timeout=120
)
else:
expected_old_pv_statuses = [
constants.STATUS_RELEASED,
constants.STATUS_FAILED,
]
try:
if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses:
try:
logger.info(f"Verifying deletion of PV {osd_pv_name}")
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except TimeoutError:
osd_pv.delete()
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except Exception as e:
logger.error(f"Old PV does not exist {e}")

# If we use LSO, we need to create and attach a new disk manually
if cluster.is_lso_cluster():
node.add_disk_to_node(osd_node)

if ocp_version < version.VERSION_4_6:
# Delete the rook ceph operator pod to trigger reconciliation
rook_operator_pod = get_operator_pods()[0]
logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
rook_operator_pod.delete()

# Delete the OSD removal job
logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
is_deleted = delete_osd_removal_job(osd_id)
assert is_deleted, "Failed to delete ocs-osd-removal-job"
logger.info("ocs-osd-removal-job deleted successfully")
# If ocp version is '4.6' and above the osd removal job should
# delete the OSD prepare job, OSD PVC, OSD deployment
# We just need to verify the old PV is in the expected status
logger.info(
f"Verify that the old PV '{osd_pv_name}' is in the expected status"
)
if cluster.is_lso_cluster():
expected_old_pv_statuses = [constants.STATUS_RELEASED]
else:
expected_old_pv_statuses = [
constants.STATUS_RELEASED,
constants.STATUS_FAILED,
]
try:
if osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses:
try:
logger.info(f"Verifying deletion of PV {osd_pv_name}")
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except TimeoutError:
osd_pv.delete()
osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
except Exception as e:
logger.error(f"Old PV does not exist {e}")

# If we use LSO, we need to create and attach a new disk manually
if cluster.is_lso_cluster():
node.add_disk_to_node(osd_node)

if ocp_version < version.VERSION_4_6:
# Delete the rook ceph operator pod to trigger reconciliation
rook_operator_pod = get_operator_pods()[0]
logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
rook_operator_pod.delete()

# Delete the OSD removal job
logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
is_deleted = delete_osd_removal_job(osd_id)
assert is_deleted, "Failed to delete ocs-osd-removal-job"
logger.info("ocs-osd-removal-job deleted successfully")

timeout = 600
# Wait for OSD PVC to get created and reach Bound state
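As a side note, here is a minimal pexpect sketch of the interactive exchange the cli_tool branch drives; the prompt strings come from the prompts_answers dict above, while the exact odf-cli output wording is an assumption:

import pexpect

osd_id = 0  # hypothetical OSD id for illustration
child = pexpect.spawn(f"odf-cli purge-osd {osd_id}")
child.expect("yes-force-destroy-osd", timeout=300)  # wait for the confirmation prompt
child.sendline("yes-force-destroy-osd")             # answer it verbatim
child.expect("completed removal of OSD", timeout=300)  # wait for the completion message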
17 changes: 12 additions & 5 deletions ocs_ci/utility/utils.py
@@ -497,25 +497,32 @@ def run_cmd(
return mask_secrets(completed_process.stdout.decode(), secrets)


def run_cmd_interactive(cmd, prompts_answers, timeout=300):
def run_cmd_interactive(
cmd, prompts_answers, timeout=300, string_answer=False, raise_exception=True
):
"""
Handle interactive prompts with answers during subctl command

Args:
cmd(str): Command to be executed
prompts_answers(dict): Prompts as keys and answers as values
timeout(int): Timeout in seconds, for pexpect to wait for prompt

string_answer (bool): If True, send the answer as-is without appending the Enter key
raise_exception (bool): If True, raise an exception on an unexpected prompt
Raises:
InteractivePromptException: in case something goes wrong

"""
child = pexpect.spawn(cmd)
for prompt, answer in prompts_answers.items():
Review comment (Member): Can we add a condition for when there are no prompts?

Reply (@OdedViner, Contributor Author, May 9, 2024): In this case, we don't need to use this function.
if child.expect(prompt, timeout=timeout):
raise InteractivePromptException("Unexpected Prompt")

if not child.sendline("".join([answer, constants.ENTER_KEY])):
if raise_exception:
raise InteractivePromptException("Unexpected Prompt")
if string_answer:
send_line = answer
else:
send_line = "".join([answer, constants.ENTER_KEY])
if not child.sendline(send_line):
raise InteractivePromptException("Failed to provide answer to the prompt")


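A hedged usage sketch of the extended signature; the command and prompt text below are hypothetical, and only the keyword arguments mirror the diff:

from ocs_ci.utility.utils import run_cmd_interactive

run_cmd_interactive(
    cmd="some-cli purge --interactive",       # hypothetical command
    prompts_answers={"Are you sure": "yes"},  # prompt pattern -> answer
    timeout=120,
    string_answer=True,      # send the answer as-is, no ENTER_KEY appended
    raise_exception=False,   # tolerate an unexpected prompt instead of raising
)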
20 changes: 20 additions & 0 deletions tests/functional/z_cluster/nodes/test_disk_failures.py
@@ -15,6 +15,7 @@
skipif_external_mode,
skipif_managed_service,
skipif_hci_provider_and_client,
skipif_ocs_version,
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.helpers.helpers import (
@@ -236,3 +237,22 @@ def test_recovery_from_volume_deletion(
self.sanity_helpers.create_resources(
pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
)

@bugzilla("2234479")
@vsphere_platform_required
@skipif_ocs_version("<4.15")
@pytest.mark.polarion_id("OCS-5502")
@skipif_external_mode
def test_recovery_from_volume_deletion_cli_tool(
Review comment (Member): Do we want to still test both scenarios, i.e. without the CLI tool? The TC test_recovery_from_volume_deletion will still get triggered as well.

self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
):
"""
Test cluster recovery from disk deletion from the platform side.
Based on documented procedure detailed in
https://bugzilla.redhat.com/show_bug.cgi?id=1823183

"""
osd_operations.osd_device_replacement(nodes, cli_tool=True)
self.sanity_helpers.create_resources(
pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
)
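To run only the new case locally, a pytest -k selection along these lines should work (the exact runner invocation depends on the local ocs-ci setup, so treat this as a sketch):

run-ci tests/functional/z_cluster/nodes/test_disk_failures.py -k test_recovery_from_volume_deletion_cli_tool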