diff --git a/ocs_ci/deployment/deployment.py b/ocs_ci/deployment/deployment.py index fd93df65be7..2cbf2e7cf70 100644 --- a/ocs_ci/deployment/deployment.py +++ b/ocs_ci/deployment/deployment.py @@ -57,6 +57,7 @@ UnexpectedDeploymentConfiguration, ResourceNotFoundError, ACMClusterConfigurationException, + ACMObservabilityNotEnabled, ) from ocs_ci.deployment.cert_manager import deploy_cert_manager from ocs_ci.deployment.zones import create_dummy_zone_labels @@ -91,6 +92,7 @@ setup_ceph_debug, get_osd_count, StorageCluster, + validate_serviceexport, ) from ocs_ci.ocs.uninstall import uninstall_ocs from ocs_ci.ocs.utils import ( @@ -148,13 +150,13 @@ from ocs_ci.helpers import helpers from ocs_ci.helpers.helpers import ( set_configmap_log_level_rook_ceph_operator, + get_default_storage_class, ) from ocs_ci.ocs.ui.helpers_ui import ui_deployment_conditions from ocs_ci.utility.utils import get_az_count from ocs_ci.utility.ibmcloud import run_ibmcloud_cmd from ocs_ci.deployment.cnv import CNVInstaller - logger = logging.getLogger(__name__) @@ -331,7 +333,7 @@ def do_gitops_deploy(self): for cluster in managed_clusters: if cluster["metadata"]["name"] != constants.ACM_LOCAL_CLUSTER: config.switch_to_cluster_by_name(cluster["metadata"]["name"]) - run_cmd( + exec_cmd( f"oc create -f {constants.CLUSTERROLEBINDING_APPSET_PULLMODEL_PATH}" ) @@ -413,6 +415,7 @@ def do_deploy_ocs(self): .get("multiClusterService") .get("enabled") ), "Failed to update StorageCluster globalnet" + validate_serviceexport() ocs_install_verification( timeout=2000, ocs_registry_image=ocs_registry_image ) @@ -664,8 +667,8 @@ def deploy_cluster(self, log_cli_level="DEBUG"): self.do_deploy_lvmo() self.do_deploy_submariner() self.do_gitops_deploy() - self.do_deploy_ocs() self.do_deploy_oadp() + self.do_deploy_ocs() self.do_deploy_rdr() self.do_deploy_fusion() self.do_deploy_odf_provider_mode() @@ -2599,6 +2602,7 @@ class RBDDRDeployOps(object): def deploy(self): self.configure_rbd() + 
@retry(ACMObservabilityNotEnabled, tries=10, delay=30)
def check_observability_status(self):
    """
    Check whether the MultiClusterObservability resource reports as enabled.

    The status of the second entry of ``.status.conditions`` on the
    ``observability`` resource is read with ``oc`` and compared against the
    literal string "True". The ``retry`` decorator re-runs the check for as
    long as ``ACMObservabilityNotEnabled`` is raised.

    Raises:
        ACMObservabilityNotEnabled: if the reported condition status is not "True"

    """
    # NOTE(review): conditions[1] is positional; selecting the condition by its
    # type would be sturdier - confirm which condition type is expected here.
    cmd_result = exec_cmd(
        "oc get MultiClusterObservability observability -o jsonpath='{.status.conditions[1].status}'"
    )

    # bool() of a result object or of any non-empty string (including "False")
    # is always True, so the command output must be compared explicitly.
    # Presumably exec_cmd returns a CompletedProcess-like object carrying bytes
    # in ``stdout``; otherwise fall back to the returned value itself.
    raw_status = getattr(cmd_result, "stdout", cmd_result)
    if isinstance(raw_status, bytes):
        raw_status = raw_status.decode()
    condition_status = str(raw_status).strip().strip("'\"")

    if condition_status != "True":
        logger.error("ACM observability could not be enabled, re-trying...")
        raise ACMObservabilityNotEnabled(
            f"MultiClusterObservability condition status is {condition_status!r}"
        )
    logger.info("ACM observability is successfully enabled")


def thanos_secret(self):
    """
    Create the thanos object storage secret and wait for ACM observability.

    The secret can be backed by a Noobaa or an AWS bucket; an AWS S3 bucket is
    created and used here. After the secret is created, the observability
    status is polled via ``check_observability_status``.

    """
    acm_indexes = get_all_acm_indexes()
    self.meta_obj.get_meta_access_secret_keys()
    thanos_secret_data = templating.load_yaml(constants.THANOS_PATH)
    thanos_bucket_name = (
        f"dr-thanos-bucket-{config.clusters[0].ENV_DATA['cluster_name']}"
    )
    self.create_s3_bucket(
        self.meta_obj.access_key,
        self.meta_obj.secret_key,
        thanos_bucket_name,
    )
    logger.info(f"ACM indexes {acm_indexes}")

    # The template stores the thanos configuration as an embedded YAML string:
    # parse it, fill in the placeholders and serialize it back to YAML.
    thanos_config = yaml.safe_load(thanos_secret_data["stringData"]["thanos.yaml"])
    thanos_config["config"].update(
        {
            "bucket": thanos_bucket_name,
            "endpoint": "s3.amazonaws.com",
            "access_key": self.meta_obj.access_key,
            "secret_key": self.meta_obj.secret_key,
        }
    )
    # str() would write a Python dict repr into the secret; dump real YAML.
    thanos_secret_data["stringData"]["thanos.yaml"] = yaml.safe_dump(
        thanos_config, default_flow_style=False
    )

    # The rendered secret contains the S3 credentials, so the temporary file is
    # closed and removed as soon as ``oc create`` has consumed it.
    with tempfile.NamedTemporaryFile(mode="w+", prefix="thanos") as thanos_data_yaml:
        templating.dump_data_to_temp_yaml(thanos_secret_data, thanos_data_yaml.name)
        logger.info(
            "Creating thanos.yaml needed for ACM observability after passing required params"
        )
        exec_cmd(f"oc create -f {thanos_data_yaml.name}")

    self.check_observability_status()


def enable_acm_observability(self):
    """
    Enable ACM observability, needed by the DR monitoring dashboard for
    Regional DR on the RHACM console.

    After switching to the ACM context this creates the
    MultiClusterObservability resource (using the default storage class),
    creates the thanos secret, creates the metrics allowlist configmap and
    labels the ``openshift-operators`` namespace for cluster monitoring.

    """
    config.switch_acm_ctx()

    defaultstorageclass = get_default_storage_class()

    logger.info("Enabling ACM MultiClusterObservability for DR monitoring dashboard")

    # Render multiclusterobservability.yaml with the default storage class
    multiclusterobservability_yaml_data = templating.load_yaml(
        constants.MULTICLUSTEROBSERVABILITY_PATH
    )
    storage_config = multiclusterobservability_yaml_data["spec"]["storageConfig"]
    storage_config["storageClass"] = defaultstorageclass[0]

    # Only the file name is needed; close the handle right away instead of
    # leaking it. The file is kept on disk (delete=False) as before.
    with tempfile.NamedTemporaryFile(
        mode="w+", prefix="multiclusterobservability", delete=False
    ) as multiclusterobservability_data_yaml:
        mco_yaml_path = multiclusterobservability_data_yaml.name
    templating.dump_data_to_temp_yaml(
        multiclusterobservability_yaml_data, mco_yaml_path
    )
    exec_cmd(f"oc create -f {mco_yaml_path}")

    # NOTE(review): the thanos secret and the allowlist configmap templates
    # target the open-cluster-management-observability namespace, which is not
    # created explicitly here - confirm it exists by this point.
    logger.info("Create thanos secret yaml")
    self.thanos_secret()

    logger.info("Whitelist RBD metrics by creating configmap")
    exec_cmd(f"oc create -f {constants.OBSERVABILITYMETRICSCONFIGMAP_PATH}")

    logger.info(
        "Add label for cluster-monitoring needed to fire VolumeSyncronizationDelayAlert on the Hub cluster"
    )
    exec_cmd(
        "oc label namespace openshift-operators openshift.io/cluster-monitoring='true'"
    )
class ACMObservabilityNotEnabled(Exception):
    """Raised when ACM MultiClusterObservability is not reported as enabled."""
False -@retry(AssertionError, 50, 10, 1) +@retry(AssertionError, 50, 20, 5) def validate_serviceexport(): """ validate the serviceexport resource diff --git a/ocs_ci/ocs/ui/acm_ui.py b/ocs_ci/ocs/ui/acm_ui.py index 1f9c79041e5..72921130d2d 100644 --- a/ocs_ci/ocs/ui/acm_ui.py +++ b/ocs_ci/ocs/ui/acm_ui.py @@ -82,7 +82,12 @@ def navigate_clusters_page(self, timeout=120): self.choose_expanded_mode( mode=True, locator=self.acm_page_nav["Infrastructure"] ) - self.do_click(locator=self.acm_page_nav["Clusters_page"], timeout=timeout) + self.do_click( + locator=self.acm_page_nav["Clusters_page"], + timeout=timeout, + enable_screenshot=True, + avoid_stale=True, + ) def navigate_bare_metal_assets_page(self): """ @@ -810,7 +815,7 @@ def fill_network_info(self): left_shift_offset = len(remote_text) - index self.do_send_keys( self.acm_page_nav["cc_vsphere_network_name"], - f"{left_shift_offset*Keys.ARROW_LEFT}{constants.SPACE}", + f"{left_shift_offset * Keys.ARROW_LEFT}{constants.SPACE}", ) except ValueError: raise ACMClusterDeployException( diff --git a/ocs_ci/templates/DR/multiclusterobservability.yaml b/ocs_ci/templates/DR/multiclusterobservability.yaml new file mode 100644 index 00000000000..5537d28601e --- /dev/null +++ b/ocs_ci/templates/DR/multiclusterobservability.yaml @@ -0,0 +1,22 @@ +# This config file is used to enable ACM observability + +--- +apiVersion: observability.open-cluster-management.io/v1beta2 +kind: MultiClusterObservability +metadata: + name: observability +spec: + enableDownsampling: true + observabilityAddonSpec: + enableMetrics: true + interval: 300 + storageConfig: + alertmanagerStorageSize: 1Gi + compactStorageSize: 100Gi + metricObjectStorage: + key: thanos.yaml + name: thanos-object-storage + receiveStorageSize: 100Gi + ruleStorageSize: 1Gi + storageClass: PLACEHOLDER + storeStorageSize: 10Gi diff --git a/ocs_ci/templates/DR/observability-metrics-configmap.yaml b/ocs_ci/templates/DR/observability-metrics-configmap.yaml new file mode 100644 index 
00000000000..aae9c543533 --- /dev/null +++ b/ocs_ci/templates/DR/observability-metrics-configmap.yaml @@ -0,0 +1,27 @@ +# This config file allows data to be reflected on the DR monitoring dashboard +# by whitelisting ceph_rbd_* metrics +# Additionally we whitelist a few odf_* metrics but those are optional. + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mp-custom-allowlist + namespace: open-cluster-management-observability +data: + metrics_list.yaml: | + names: + - odf_system_health_status + - odf_system_map + - odf_system_raw_capacity_total_bytes + - odf_system_raw_capacity_used_bytes + - ceph_rbd_mirror_snapshot_sync_bytes + - ceph_rbd_mirror_snapshot_snapshots + - ceph_rbd_mirror_snapshot_sync_time_sum + matches: + - __name__="csv_succeeded",exported_namespace="openshift-storage",name=~"odf-operator.*" + - __name__="csv_succeeded",exported_namespace="openshift-dr-system",name=~"odr-cluster-operator.*" + - __name__="csv_succeeded",exported_namespace="openshift-operators",name=~"volsync.*" + recording_rules: + - record: count_persistentvolumeclaim_total + expr: count(kube_persistentvolumeclaim_info) diff --git a/ocs_ci/templates/DR/thanos.yaml b/ocs_ci/templates/DR/thanos.yaml new file mode 100644 index 00000000000..4e7e6db780b --- /dev/null +++ b/ocs_ci/templates/DR/thanos.yaml @@ -0,0 +1,21 @@ +# This config file is used to enable ACM observability. +# When the Observability service is enabled, the hub cluster is always configured +# to collect and send metrics to the configured Thanos instance, regardless of whether +# hub self-management is enabled or not. + +--- +apiVersion: v1 +kind: Secret +metadata: + name: thanos-object-storage + namespace: open-cluster-management-observability +type: Opaque +stringData: + thanos.yaml: | + type: s3 + config: + bucket: PLACEHOLDER + endpoint: PLACEHOLDER + insecure: true + access_key: PLACEHOLDER + secret_key: PLACEHOLDER