Skip to content

Commit

Permalink
[RDR] Enable ACM observability for DR monitoring dashboard on RHACM c…
Browse files Browse the repository at this point in the history
…onsole (#9646)

Signed-off-by: am-agrawa <amagrawa@redhat.com>
  • Loading branch information
am-agrawa authored Aug 6, 2024
1 parent 5a893a2 commit 21d85b9
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 10 deletions.
116 changes: 111 additions & 5 deletions ocs_ci/deployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
UnexpectedDeploymentConfiguration,
ResourceNotFoundError,
ACMClusterConfigurationException,
ACMObservabilityNotEnabled,
)
from ocs_ci.deployment.cert_manager import deploy_cert_manager
from ocs_ci.deployment.zones import create_dummy_zone_labels
Expand Down Expand Up @@ -91,6 +92,7 @@
setup_ceph_debug,
get_osd_count,
StorageCluster,
validate_serviceexport,
)
from ocs_ci.ocs.uninstall import uninstall_ocs
from ocs_ci.ocs.utils import (
Expand Down Expand Up @@ -148,13 +150,13 @@
from ocs_ci.helpers import helpers
from ocs_ci.helpers.helpers import (
set_configmap_log_level_rook_ceph_operator,
get_default_storage_class,
)
from ocs_ci.ocs.ui.helpers_ui import ui_deployment_conditions
from ocs_ci.utility.utils import get_az_count
from ocs_ci.utility.ibmcloud import run_ibmcloud_cmd
from ocs_ci.deployment.cnv import CNVInstaller


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -331,7 +333,7 @@ def do_gitops_deploy(self):
for cluster in managed_clusters:
if cluster["metadata"]["name"] != constants.ACM_LOCAL_CLUSTER:
config.switch_to_cluster_by_name(cluster["metadata"]["name"])
run_cmd(
exec_cmd(
f"oc create -f {constants.CLUSTERROLEBINDING_APPSET_PULLMODEL_PATH}"
)

Expand Down Expand Up @@ -413,6 +415,7 @@ def do_deploy_ocs(self):
.get("multiClusterService")
.get("enabled")
), "Failed to update StorageCluster globalnet"
validate_serviceexport()
ocs_install_verification(
timeout=2000, ocs_registry_image=ocs_registry_image
)
Expand Down Expand Up @@ -664,8 +667,8 @@ def deploy_cluster(self, log_cli_level="DEBUG"):
self.do_deploy_lvmo()
self.do_deploy_submariner()
self.do_gitops_deploy()
self.do_deploy_ocs()
self.do_deploy_oadp()
self.do_deploy_ocs()
self.do_deploy_rdr()
self.do_deploy_fusion()
self.do_deploy_odf_provider_mode()
Expand Down Expand Up @@ -2599,6 +2602,7 @@ class RBDDRDeployOps(object):
def deploy(self):
self.configure_rbd()

@retry(ResourceWrongStatusException, tries=10, delay=5)
def configure_rbd(self):
st_string = '{.items[?(@.metadata.ownerReferences[*].kind=="StorageCluster")].spec.mirroring.enabled}'
query_mirroring = (
Expand Down Expand Up @@ -3209,7 +3213,7 @@ def validate_dpa(self):
if veleropod[0]["status"]["phase"] != "Running":
raise ACMClusterConfigurationException("Velero pod not in 'Running' phase")

# Check backupstoragelocation resource in "Available" phase
# Check backupstoragelocation resource is in "Available" phase
backupstorage = ocp.OCP(
kind="BackupStorageLocation",
resource_name="default",
Expand All @@ -3218,7 +3222,7 @@ def validate_dpa(self):
resource = backupstorage.get()
if resource["status"].get("phase") != "Available":
raise ACMClusterConfigurationException(
"Backupstoragelocation resource is not in 'Avaialble' phase"
"Backupstoragelocation resource is not in 'Available' phase"
)
logger.info("Dataprotection application successful")

Expand Down Expand Up @@ -3411,6 +3415,7 @@ def deploy(self):
rbddops = RBDDRDeployOps()
self.configure_mirror_peer()
rbddops.deploy()
self.enable_acm_observability()
self.deploy_dr_policy()

# Enable cluster backup on both ACMs
Expand Down Expand Up @@ -3444,6 +3449,107 @@ def deploy(self):
else:
self.enable_managed_serviceaccount()

@retry(ACMObservabilityNotEnabled, tries=10, delay=30)
def check_observability_status(self):
    """
    Check whether ACM MultiClusterObservability reports itself as enabled.

    Reads ``.status.conditions[1].status`` of the ``observability``
    MultiClusterObservability resource and accepts only the literal value
    "True". The ``retry`` decorator re-runs this method whenever
    ACMObservabilityNotEnabled is raised (tries=10, delay=30).

    Raises:
        ACMObservabilityNotEnabled: if the reported condition status is not "True"

    """
    cmd_result = exec_cmd(
        "oc get MultiClusterObservability observability -o jsonpath='{.status.conditions[1].status}'"
    )

    # bool() of a command-result object, or of any non-empty string such as
    # "False", is always truthy, so the textual output has to be compared.
    raw_status = getattr(cmd_result, "stdout", cmd_result)
    if isinstance(raw_status, bytes):
        raw_status = raw_status.decode()
    condition_status = str(raw_status).strip().strip("'\"")

    if condition_status != "True":
        logger.error("ACM observability could not be enabled, re-trying...")
        raise ACMObservabilityNotEnabled(
            f"MultiClusterObservability condition status is {condition_status!r}"
        )

    logger.info("ACM observability is successfully enabled")

def thanos_secret(self):
    """
    Create the thanos object-storage secret needed by ACM observability.

    An AWS S3 bucket is created with the access/secret keys held by
    ``self.meta_obj``; the thanos secret template is then filled in with the
    bucket name, endpoint and keys, written to a temporary yaml file and
    applied with ``oc create``. Finally the observability status is checked.
    """
    acm_indexes = get_all_acm_indexes()
    self.meta_obj.get_meta_access_secret_keys()
    thanos_secret_data = templating.load_yaml(constants.THANOS_PATH)
    thanos_bucket_name = (
        f"dr-thanos-bucket-{config.clusters[0].ENV_DATA['cluster_name']}"
    )
    self.create_s3_bucket(
        self.meta_obj.access_key,
        self.meta_obj.secret_key,
        thanos_bucket_name,
    )
    logger.info(f"ACM indexes {acm_indexes}")

    # The template stores the thanos config as an embedded yaml string:
    # parse it, replace the placeholders, then serialise it back.
    thanos_config = yaml.safe_load(thanos_secret_data["stringData"]["thanos.yaml"])
    thanos_config["config"]["bucket"] = thanos_bucket_name
    thanos_config["config"]["endpoint"] = "s3.amazonaws.com"
    thanos_config["config"]["access_key"] = self.meta_obj.access_key
    thanos_config["config"]["secret_key"] = self.meta_obj.secret_key
    # safe_dump emits real yaml; str() would write a Python dict repr whose
    # quoting/escaping rules differ from yaml.
    thanos_secret_data["stringData"]["thanos.yaml"] = yaml.safe_dump(thanos_config)

    # Only the file name is needed; close the handle right away so it does
    # not stay open. delete=False keeps the file on disk.
    with tempfile.NamedTemporaryFile(
        mode="w+", prefix="thanos", delete=False
    ) as thanos_data_yaml:
        thanos_data_yaml_path = thanos_data_yaml.name
    templating.dump_data_to_temp_yaml(thanos_secret_data, thanos_data_yaml_path)

    logger.info(
        "Creating thanos.yaml needed for ACM observability after passing required params"
    )
    exec_cmd(f"oc create -f {thanos_data_yaml_path}")

    self.check_observability_status()

def enable_acm_observability(self):
    """
    Enable ACM observability for the Regional DR monitoring dashboard on the RHACM console.

    Steps, all run after switching to the ACM cluster context:
        1. Create the MultiClusterObservability resource from its template,
           using the first default storage class as ``storageClass``.
        2. Create the thanos secret (see ``thanos_secret``).
        3. Create the configmap that allowlists the RBD metrics.
        4. Label the ``openshift-operators`` namespace for cluster-monitoring.
    """
    config.switch_acm_ctx()

    defaultstorageclass = get_default_storage_class()

    logger.info(
        "Enabling ACM MultiClusterObservability for DR monitoring dashboard"
    )

    # load multiclusterobservability.yaml and fill in the storage class
    multiclusterobservability_yaml_data = templating.load_yaml(
        constants.MULTICLUSTEROBSERVABILITY_PATH
    )
    multiclusterobservability_yaml_data["spec"]["storageConfig"][
        "storageClass"
    ] = defaultstorageclass[0]

    # Only the file name is needed; close the handle right away so it does
    # not stay open. delete=False keeps the file on disk.
    with tempfile.NamedTemporaryFile(
        mode="w+", prefix="multiclusterobservability", delete=False
    ) as multiclusterobservability_data_yaml:
        multiclusterobservability_yaml_path = (
            multiclusterobservability_data_yaml.name
        )
    templating.dump_data_to_temp_yaml(
        multiclusterobservability_yaml_data,
        multiclusterobservability_yaml_path,
    )

    exec_cmd(f"oc create -f {multiclusterobservability_yaml_path}")

    logger.info("Create thanos secret yaml")
    self.thanos_secret()

    logger.info("Whitelist RBD metrics by creating configmap")
    exec_cmd(f"oc create -f {constants.OBSERVABILITYMETRICSCONFIGMAP_PATH}")

    logger.info(
        "Add label for cluster-monitoring needed to fire VolumeSynchronizationDelayAlert on the Hub cluster"
    )
    exec_cmd(
        "oc label namespace openshift-operators openshift.io/cluster-monitoring='true'"
    )


class MDRMultiClusterDROperatorsDeploy(MultiClusterDROperatorsDeploy):
"""
Expand Down
11 changes: 9 additions & 2 deletions ocs_ci/ocs/acm/acm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import (
NoSuchElementException,
)
from ocs_ci.helpers.helpers import create_unique_resource_name
from ocs_ci.ocs import constants
from ocs_ci.ocs.acm.acm_constants import (
Expand Down Expand Up @@ -56,6 +58,7 @@ class AcmAddClusters(AcmPageNavigator):
# Initialise the page object: run the parent initialiser, then set up the
# locator alias and the driver attribute used by this class.
def __init__(self):
super().__init__()
# Short alias for acm_page_nav; entries are looked up by string key and passed to do_click()
self.page_nav = self.acm_page_nav
# NOTE(review): SeleniumDriver() is called on every instantiation; whether it returns a shared driver is not visible here — confirm
self.driver = SeleniumDriver()

def import_cluster_ui(self, cluster_name, kubeconfig_location):
"""
Expand Down Expand Up @@ -234,7 +237,11 @@ def install_submariner_ui(self, globalnet=True):
log.info("Click on 'Submariner add-ons' tab")
self.do_click(self.page_nav["submariner-tab"])
log.info("Click on 'Install Submariner add-ons' button")
self.do_click(self.page_nav["install-submariner-btn"], timeout=120)
self.do_click(
self.page_nav["install-submariner-btn"],
enable_screenshot=True,
avoid_stale=True,
)
log.info("Click on 'Target clusters'")
self.do_click(self.page_nav["target-clusters"])
log.info(f"Select 1st cluster which is {cluster_name_a}")
Expand Down
7 changes: 7 additions & 0 deletions ocs_ci/ocs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2760,6 +2760,13 @@
# Paths of DR templates, all located under TEMPLATE_DIR/DR.

# ClusterRoleBinding template created with `oc create -f` on the managed clusters
CLUSTERROLEBINDING_APPSET_PULLMODEL_PATH = os.path.join(
TEMPLATE_DIR, "DR", "clusterrolebinding_appset_pullmodel.yaml"
)
# Secret template holding the thanos object-storage config for ACM observability
THANOS_PATH = os.path.join(TEMPLATE_DIR, "DR", "thanos.yaml")
# MultiClusterObservability resource template used to enable ACM observability
MULTICLUSTEROBSERVABILITY_PATH = os.path.join(
TEMPLATE_DIR, "DR", "multiclusterobservability.yaml"
)
# ConfigMap template that allowlists metrics for the DR monitoring dashboard
OBSERVABILITYMETRICSCONFIGMAP_PATH = os.path.join(
TEMPLATE_DIR, "DR", "observability-metrics-configmap.yaml"
)
APPLICATION_SET = "ApplicationSet"
PLACEMENT = "Placement"
GITOPS_CLUSTER_NAMESPACE = "openshift-gitops"
Expand Down
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,5 +712,9 @@ class APIRequestError(Exception):
pass


class ACMObservabilityNotEnabled(Exception):
    """Raised when ACM observability is not reported as enabled."""


class ProviderModeNotFoundException(Exception):
    """Provider-mode lookup failure (NOTE(review): raise sites are not visible here — confirm)."""
2 changes: 1 addition & 1 deletion ocs_ci/ocs/resources/storage_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -2675,7 +2675,7 @@ def patch_storage_cluster_for_custom_storage_class(
return False


@retry(AssertionError, 50, 10, 1)
@retry(AssertionError, 50, 20, 5)
def validate_serviceexport():
"""
validate the serviceexport resource
Expand Down
9 changes: 7 additions & 2 deletions ocs_ci/ocs/ui/acm_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ def navigate_clusters_page(self, timeout=120):
self.choose_expanded_mode(
mode=True, locator=self.acm_page_nav["Infrastructure"]
)
self.do_click(locator=self.acm_page_nav["Clusters_page"], timeout=timeout)
self.do_click(
locator=self.acm_page_nav["Clusters_page"],
timeout=timeout,
enable_screenshot=True,
avoid_stale=True,
)

def navigate_bare_metal_assets_page(self):
"""
Expand Down Expand Up @@ -810,7 +815,7 @@ def fill_network_info(self):
left_shift_offset = len(remote_text) - index
self.do_send_keys(
self.acm_page_nav["cc_vsphere_network_name"],
f"{left_shift_offset*Keys.ARROW_LEFT}{constants.SPACE}",
f"{left_shift_offset * Keys.ARROW_LEFT}{constants.SPACE}",
)
except ValueError:
raise ACMClusterDeployException(
Expand Down
22 changes: 22 additions & 0 deletions ocs_ci/templates/DR/multiclusterobservability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# This config file is used to enable ACM observability

---
apiVersion: observability.open-cluster-management.io/v1beta2
kind: MultiClusterObservability
metadata:
name: observability
spec:
enableDownsampling: true
observabilityAddonSpec:
enableMetrics: true
interval: 300
storageConfig:
alertmanagerStorageSize: 1Gi
compactStorageSize: 100Gi
metricObjectStorage:
key: thanos.yaml
name: thanos-object-storage
receiveStorageSize: 100Gi
ruleStorageSize: 1Gi
storageClass: PLACEHOLDER
storeStorageSize: 10Gi
27 changes: 27 additions & 0 deletions ocs_ci/templates/DR/observability-metrics-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This config file allows data to be reflected on the DR monitoring dashboard
# by whitelisting ceph_rbd_* metrics
# Additionally we whitelist a few odf_* metrics but those are optional.

---
apiVersion: v1
kind: ConfigMap
metadata:
name: mp-custom-allowlist
namespace: open-cluster-management-observability
data:
metrics_list.yaml: |
names:
- odf_system_health_status
- odf_system_map
- odf_system_raw_capacity_total_bytes
- odf_system_raw_capacity_used_bytes
- ceph_rbd_mirror_snapshot_sync_bytes
- ceph_rbd_mirror_snapshot_snapshots
- ceph_rbd_mirror_snapshot_sync_time_sum
matches:
- __name__="csv_succeeded",exported_namespace="openshift-storage",name=~"odf-operator.*"
- __name__="csv_succeeded",exported_namespace="openshift-dr-system",name=~"odr-cluster-operator.*"
- __name__="csv_succeeded",exported_namespace="openshift-operators",name=~"volsync.*"
recording_rules:
- record: count_persistentvolumeclaim_total
expr: count(kube_persistentvolumeclaim_info)
21 changes: 21 additions & 0 deletions ocs_ci/templates/DR/thanos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# This config file is used to enable ACM observability.
# When the Observability service is enabled, the hub cluster is always configured
# to collect and send metrics to the configured Thanos instance, regardless of whether
# hub self-management is enabled or not.

---
apiVersion: v1
kind: Secret
metadata:
name: thanos-object-storage
namespace: open-cluster-management-observability
type: Opaque
stringData:
thanos.yaml: |
type: s3
config:
bucket: PLACEHOLDER
endpoint: PLACEHOLDER
insecure: true
access_key: PLACEHOLDER
secret_key: PLACEHOLDER

0 comments on commit 21d85b9

Please sign in to comment.