Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RDR] Enable ACM observability for DR monitoring dashboard on RHACM console #9646

Merged
merged 32 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
215df40
code base
am-agrawa Apr 5, 2024
8c57bed
code base to enable ACM observability
am-agrawa Apr 6, 2024
e66879c
Remove file
am-agrawa Apr 6, 2024
692be43
added notes in yaml files
am-agrawa Apr 6, 2024
b5b336f
code fixes and feedback
am-agrawa May 17, 2024
6a2991e
revert unwanted code changes
am-agrawa May 17, 2024
f6f83ff
revert unwanted code changes-2
am-agrawa May 17, 2024
4ec1402
rebase on 29may24
am-agrawa May 29, 2024
4da9c25
feedback fixes
am-agrawa May 29, 2024
bb7c4d3
fix retry marker
am-agrawa May 29, 2024
426fd3a
import error correction
am-agrawa May 29, 2024
ab50d4c
increase timeout for gitops to be ready
am-agrawa May 30, 2024
ff71c75
add path in constants for yaml
am-agrawa May 31, 2024
257de49
validate_serviceexport before ocs-install-verification
am-agrawa Jun 3, 2024
55544b6
install oadp first, enable obs. before policy creation
am-agrawa Jun 4, 2024
f6b89da
rebase, 4th june24
am-agrawa Jun 4, 2024
1b74928
increase serviceexport time
am-agrawa Jun 5, 2024
8c0f228
add retry to blockpool status check
am-agrawa Jun 5, 2024
f980e65
rebase on 18july24
am-agrawa Jul 18, 2024
fe28316
rebase on 18july24
am-agrawa Jul 18, 2024
c9fc357
new rebase on 18july24
am-agrawa Jul 18, 2024
73d8615
re-try
am-agrawa Jul 18, 2024
493b59d
flake8 issue
am-agrawa Jul 18, 2024
9a6dc61
rebase on 23july24
am-agrawa Jul 23, 2024
e5e2d1c
code fixes
am-agrawa Jul 23, 2024
019af84
code fixes after local run
am-agrawa Jul 23, 2024
ca12f5a
fix stale element
am-agrawa Jul 30, 2024
73d02b5
revert all changes, mark stale to true in do_click
am-agrawa Jul 31, 2024
9aca886
another stale element fix
am-agrawa Aug 2, 2024
fabb65b
feedback cmt
am-agrawa Aug 5, 2024
6ab0598
replace run_cmd to exec_cmd
am-agrawa Aug 6, 2024
ba6ab91
doctext formatting
am-agrawa Aug 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 110 additions & 5 deletions ocs_ci/deployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
UnexpectedDeploymentConfiguration,
ResourceNotFoundError,
ACMClusterConfigurationException,
ACMObservabilityNotEnabled,
)
from ocs_ci.deployment.cert_manager import deploy_cert_manager
from ocs_ci.deployment.zones import create_dummy_zone_labels
Expand Down Expand Up @@ -91,6 +92,7 @@
setup_ceph_debug,
get_osd_count,
StorageCluster,
validate_serviceexport,
)
from ocs_ci.ocs.uninstall import uninstall_ocs
from ocs_ci.ocs.utils import (
Expand Down Expand Up @@ -148,13 +150,13 @@
from ocs_ci.helpers import helpers
from ocs_ci.helpers.helpers import (
set_configmap_log_level_rook_ceph_operator,
get_default_storage_class,
)
from ocs_ci.ocs.ui.helpers_ui import ui_deployment_conditions
from ocs_ci.utility.utils import get_az_count
from ocs_ci.utility.ibmcloud import run_ibmcloud_cmd
from ocs_ci.deployment.cnv import CNVInstaller


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -331,7 +333,7 @@ def do_gitops_deploy(self):
for cluster in managed_clusters:
if cluster["metadata"]["name"] != constants.ACM_LOCAL_CLUSTER:
config.switch_to_cluster_by_name(cluster["metadata"]["name"])
run_cmd(
exec_cmd(
f"oc create -f {constants.CLUSTERROLEBINDING_APPSET_PULLMODEL_PATH}"
)

Expand Down Expand Up @@ -413,6 +415,7 @@ def do_deploy_ocs(self):
.get("multiClusterService")
.get("enabled")
), "Failed to update StorageCluster globalnet"
validate_serviceexport()
ocs_install_verification(
timeout=2000, ocs_registry_image=ocs_registry_image
)
Expand Down Expand Up @@ -664,8 +667,8 @@ def deploy_cluster(self, log_cli_level="DEBUG"):
self.do_deploy_lvmo()
self.do_deploy_submariner()
self.do_gitops_deploy()
self.do_deploy_ocs()
self.do_deploy_oadp()
self.do_deploy_ocs()
self.do_deploy_rdr()
self.do_deploy_fusion()
self.do_deploy_odf_provider_mode()
Expand Down Expand Up @@ -2599,6 +2602,7 @@ class RBDDRDeployOps(object):
def deploy(self):
self.configure_rbd()

@retry(ResourceWrongStatusException, tries=10, delay=5)
def configure_rbd(self):
st_string = '{.items[?(@.metadata.ownerReferences[*].kind=="StorageCluster")].spec.mirroring.enabled}'
query_mirroring = (
Expand Down Expand Up @@ -3209,7 +3213,7 @@ def validate_dpa(self):
if veleropod[0]["status"]["phase"] != "Running":
raise ACMClusterConfigurationException("Velero pod not in 'Running' phase")

# Check backupstoragelocation resource in "Available" phase
# Check backupstoragelocation resource is in "Available" phase
backupstorage = ocp.OCP(
kind="BackupStorageLocation",
resource_name="default",
Expand All @@ -3218,7 +3222,7 @@ def validate_dpa(self):
resource = backupstorage.get()
if resource["status"].get("phase") != "Available":
raise ACMClusterConfigurationException(
"Backupstoragelocation resource is not in 'Avaialble' phase"
"Backupstoragelocation resource is not in 'Available' phase"
)
logger.info("Dataprotection application successful")

Expand Down Expand Up @@ -3411,6 +3415,7 @@ def deploy(self):
rbddops = RBDDRDeployOps()
self.configure_mirror_peer()
rbddops.deploy()
self.enable_acm_observability()
self.deploy_dr_policy()

# Enable cluster backup on both ACMs
Expand Down Expand Up @@ -3444,6 +3449,106 @@ def deploy(self):
else:
self.enable_managed_serviceaccount()

@retry(ACMObservabilityNotEnabled, tries=10, delay=30)
def check_observability_status(self):
    """
    Check observability status

    Queries the ``status`` field of the second entry in ``.status.conditions``
    of the MultiClusterObservability resource named ``observability`` and
    compares it against the string "True".

    Raises:
        ACMObservabilityNotEnabled: if the reported condition status is not "True",
            i.e. ACM observability is not enabled

    """
    cmd_result = exec_cmd(
        "oc get MultiClusterObservability observability -o jsonpath='{.status.conditions[1].status}'"
    )

    # Wrapping the command result in bool() is truthy for any non-empty result,
    # including the text "False", so the actual output has to be compared.
    # NOTE(review): exec_cmd presumably returns a CompletedProcess-like object
    # with bytes in `stdout`; a plain bytes/str return value is handled as well.
    raw_status = getattr(cmd_result, "stdout", cmd_result)
    if isinstance(raw_status, bytes):
        raw_status = raw_status.decode()
    # Drop surrounding whitespace and any quote characters left over from the
    # jsonpath expression.
    status_text = str(raw_status).strip().strip("'\"")

    # NOTE(review): the condition is selected by position (index 1), not by its
    # type - confirm the ordering of conditions is stable for this resource.
    if status_text.lower() != "true":
        logger.error("ACM observability could not be enabled, re-trying...")
        raise ACMObservabilityNotEnabled(
            f"MultiClusterObservability condition status is '{status_text}', expected 'True'"
        )

    logger.info("ACM observability is successfully enabled")

def thanos_secret(self):
    """
    Create thanos secret yaml by using Noobaa or AWS bucket (AWS bucket is used in this function)

    The secret template is loaded from constants.THANOS_PATH, its embedded
    ``thanos.yaml`` document is filled in with the bucket name, the S3 endpoint
    and the access/secret keys held by ``self.meta_obj``, and the resulting
    secret is created with ``oc create``. Afterwards the observability status
    is checked via ``check_observability_status``.

    """
    acm_indexes = get_all_acm_indexes()
    self.meta_obj.get_meta_access_secret_keys()
    thanos_secret_data = templating.load_yaml(constants.THANOS_PATH)
    thanos_bucket_name = (
        f"dr-thanos-bucket-{config.clusters[0].ENV_DATA['cluster_name']}"
    )
    self.create_s3_bucket(
        self.meta_obj.access_key,
        self.meta_obj.secret_key,
        thanos_bucket_name,
    )
    logger.info(f"ACM indexes {acm_indexes}")

    # The secret carries the thanos configuration as an embedded YAML document,
    # so it is parsed, updated and serialized again.
    thanos_config = yaml.safe_load(thanos_secret_data["stringData"]["thanos.yaml"])
    thanos_config["config"]["bucket"] = thanos_bucket_name
    thanos_config["config"]["endpoint"] = "s3.amazonaws.com"
    thanos_config["config"]["access_key"] = self.meta_obj.access_key
    thanos_config["config"]["secret_key"] = self.meta_obj.secret_key
    # Serialize with a YAML dumper: str() of a dict yields a Python repr, which
    # is only accidentally YAML-compatible (e.g. None would become "None").
    thanos_secret_data["stringData"]["thanos.yaml"] = yaml.safe_dump(thanos_config)

    # Only the file name is needed; close the handle right away so it does not
    # leak. delete=False keeps the file on disk for the oc command below.
    with tempfile.NamedTemporaryFile(
        mode="w+", prefix="thanos", delete=False
    ) as thanos_data_yaml:
        thanos_yaml_path = thanos_data_yaml.name
    templating.dump_data_to_temp_yaml(thanos_secret_data, thanos_yaml_path)

    logger.info(
        "Creating thanos.yaml needed for ACM observability after passing required params"
    )
    exec_cmd(f"oc create -f {thanos_yaml_path}")

    self.check_observability_status()

def enable_acm_observability(self):
    """
    Enable ACM observability, which is needed for the DR monitoring dashboard
    for Regional DR on the RHACM console.

    Steps performed after calling ``config.switch_acm_ctx()``:
        1. Create the MultiClusterObservability resource from its template,
           with ``storageClass`` set to the first entry returned by
           ``get_default_storage_class()``
        2. Create the thanos secret (``self.thanos_secret()``)
        3. Create the configmap from constants.OBSERVABILITYMETRICSCONFIGMAP_PATH
        4. Label the ``openshift-operators`` namespace for cluster-monitoring

    """
    config.switch_acm_ctx()

    storage_classes = get_default_storage_class()

    logger.info("Enabling ACM MultiClusterObservability for DR monitoring dashboard")

    # Fill the storage class into the multiclusterobservability.yaml template
    mco_data = templating.load_yaml(constants.MULTICLUSTEROBSERVABILITY_PATH)
    default_sc_name = storage_classes[0]
    mco_spec = mco_data["spec"]
    mco_spec["storageConfig"]["storageClass"] = default_sc_name

    # Write the rendered resource to a temp file which is kept on disk
    mco_temp_file = tempfile.NamedTemporaryFile(
        mode="w+", prefix="multiclusterobservability", delete=False
    )
    mco_yaml_path = mco_temp_file.name
    templating.dump_data_to_temp_yaml(mco_data, mco_yaml_path)

    exec_cmd(f"oc create -f {mco_yaml_path}")

    logger.info("Create thanos secret yaml")
    self.thanos_secret()

    logger.info("Whitelist RBD metrics by creating configmap")
    configmap_path = constants.OBSERVABILITYMETRICSCONFIGMAP_PATH
    exec_cmd(f"oc create -f {configmap_path}")

    logger.info(
        "Add label for cluster-monitoring needed to fire VolumeSyncronizationDelayAlert on the Hub cluster"
    )
    label_cmd = "oc label namespace openshift-operators openshift.io/cluster-monitoring='true'"
    exec_cmd(label_cmd)


class MDRMultiClusterDROperatorsDeploy(MultiClusterDROperatorsDeploy):
"""
Expand Down
11 changes: 9 additions & 2 deletions ocs_ci/ocs/acm/acm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import (
NoSuchElementException,
)
from ocs_ci.helpers.helpers import create_unique_resource_name
from ocs_ci.ocs import constants
from ocs_ci.ocs.acm.acm_constants import (
Expand Down Expand Up @@ -56,6 +58,7 @@ class AcmAddClusters(AcmPageNavigator):
def __init__(self):
super().__init__()
self.page_nav = self.acm_page_nav
self.driver = SeleniumDriver()

def import_cluster_ui(self, cluster_name, kubeconfig_location):
"""
Expand Down Expand Up @@ -234,7 +237,11 @@ def install_submariner_ui(self, globalnet=True):
log.info("Click on 'Submariner add-ons' tab")
self.do_click(self.page_nav["submariner-tab"])
log.info("Click on 'Install Submariner add-ons' button")
self.do_click(self.page_nav["install-submariner-btn"], timeout=120)
self.do_click(
self.page_nav["install-submariner-btn"],
enable_screenshot=True,
avoid_stale=True,
)
log.info("Click on 'Target clusters'")
self.do_click(self.page_nav["target-clusters"])
log.info(f"Select 1st cluster which is {cluster_name_a}")
Expand Down
7 changes: 7 additions & 0 deletions ocs_ci/ocs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2760,6 +2760,13 @@
CLUSTERROLEBINDING_APPSET_PULLMODEL_PATH = os.path.join(
TEMPLATE_DIR, "DR", "clusterrolebinding_appset_pullmodel.yaml"
)
THANOS_PATH = os.path.join(TEMPLATE_DIR, "DR", "thanos.yaml")
MULTICLUSTEROBSERVABILITY_PATH = os.path.join(
TEMPLATE_DIR, "DR", "multiclusterobservability.yaml"
)
OBSERVABILITYMETRICSCONFIGMAP_PATH = os.path.join(
TEMPLATE_DIR, "DR", "observability-metrics-configmap.yaml"
)
APPLICATION_SET = "ApplicationSet"
PLACEMENT = "Placement"
GITOPS_CLUSTER_NAMESPACE = "openshift-gitops"
Expand Down
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,5 +712,9 @@ class APIRequestError(Exception):
pass


class ACMObservabilityNotEnabled(Exception):
    """
    Raised when the ACM observability status check finds that observability
    is not enabled
    """

    pass


class ProviderModeNotFoundException(Exception):
pass
2 changes: 1 addition & 1 deletion ocs_ci/ocs/resources/storage_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -2675,7 +2675,7 @@ def patch_storage_cluster_for_custom_storage_class(
return False


@retry(AssertionError, 50, 10, 1)
@retry(AssertionError, 50, 20, 5)
def validate_serviceexport():
"""
validate the serviceexport resource
Expand Down
9 changes: 7 additions & 2 deletions ocs_ci/ocs/ui/acm_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ def navigate_clusters_page(self, timeout=120):
self.choose_expanded_mode(
mode=True, locator=self.acm_page_nav["Infrastructure"]
)
self.do_click(locator=self.acm_page_nav["Clusters_page"], timeout=timeout)
self.do_click(
locator=self.acm_page_nav["Clusters_page"],
timeout=timeout,
enable_screenshot=True,
avoid_stale=True,
)

def navigate_bare_metal_assets_page(self):
"""
Expand Down Expand Up @@ -810,7 +815,7 @@ def fill_network_info(self):
left_shift_offset = len(remote_text) - index
self.do_send_keys(
self.acm_page_nav["cc_vsphere_network_name"],
f"{left_shift_offset*Keys.ARROW_LEFT}{constants.SPACE}",
f"{left_shift_offset * Keys.ARROW_LEFT}{constants.SPACE}",
)
except ValueError:
raise ACMClusterDeployException(
Expand Down
22 changes: 22 additions & 0 deletions ocs_ci/templates/DR/multiclusterobservability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# This config file is used to enable ACM observability

---
apiVersion: observability.open-cluster-management.io/v1beta2
kind: MultiClusterObservability
metadata:
name: observability
spec:
enableDownsampling: true
observabilityAddonSpec:
enableMetrics: true
interval: 300
storageConfig:
alertmanagerStorageSize: 1Gi
compactStorageSize: 100Gi
metricObjectStorage:
key: thanos.yaml
name: thanos-object-storage
receiveStorageSize: 100Gi
ruleStorageSize: 1Gi
storageClass: PLACEHOLDER
storeStorageSize: 10Gi
27 changes: 27 additions & 0 deletions ocs_ci/templates/DR/observability-metrics-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This config file allows data to be reflected on the DR monitoring dashboard
# by whitelisting ceph_rbd_* metrics.
# Additionally, we whitelist a few odf_* metrics, but those are optional.

---
apiVersion: v1
kind: ConfigMap
metadata:
name: mp-custom-allowlist
namespace: open-cluster-management-observability
data:
metrics_list.yaml: |
names:
- odf_system_health_status
- odf_system_map
- odf_system_raw_capacity_total_bytes
- odf_system_raw_capacity_used_bytes
- ceph_rbd_mirror_snapshot_sync_bytes
- ceph_rbd_mirror_snapshot_snapshots
- ceph_rbd_mirror_snapshot_sync_time_sum
matches:
- __name__="csv_succeeded",exported_namespace="openshift-storage",name=~"odf-operator.*"
- __name__="csv_succeeded",exported_namespace="openshift-dr-system",name=~"odr-cluster-operator.*"
- __name__="csv_succeeded",exported_namespace="openshift-operators",name=~"volsync.*"
recording_rules:
- record: count_persistentvolumeclaim_total
expr: count(kube_persistentvolumeclaim_info)
21 changes: 21 additions & 0 deletions ocs_ci/templates/DR/thanos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# This config file is used to enable ACM observability.
# When the Observability service is enabled, the hub cluster is always configured
# to collect and send metrics to the configured Thanos instance, regardless of whether
# hub self-management is enabled or not.

---
apiVersion: v1
kind: Secret
metadata:
name: thanos-object-storage
namespace: open-cluster-management-observability
type: Opaque
stringData:
thanos.yaml: |
type: s3
config:
bucket: PLACEHOLDER
endpoint: PLACEHOLDER
insecure: true
access_key: PLACEHOLDER
secret_key: PLACEHOLDER
Loading