From 04a1a64c565adbe03b3c5494eccae1b24a6b9a94 Mon Sep 17 00:00:00 2001
From: Benamar Mekhissi
Date: Mon, 11 Dec 2023 09:17:02 -0500
Subject: [PATCH] Enhancing Hub Recovery: Reworking DRPC State Rebuilding
 Algorithm

This commit tackles hub recovery issues by reworking the algorithm
responsible for rebuilding the DRPC state. The changes align with the
following expectations:

1. Stop Condition for Both Failed Queries:
   If attempts to query 2 clusters result in failure for both, the process
   is halted.

2. Initial Deployment without VRGs:
   If 2 clusters are successfully queried, and no VRGs are found, proceed
   with the initial deployment.

3. Handling Failures with S3 Store Check:
   - If 2 clusters are queried, 1 fails, and 0 VRGs are found, perform the
     following checks:
     - If the VRG is found in the S3 store, ensure that the DRPC action
       matches the VRG action. If not, stop until the action is corrected,
       allowing failover if necessary (set PeerReady).
     - If the VRG is not found in the S3 store and the failed cluster is not
       the destination cluster, continue with the initial deployment.

4. Verification and Failover for VRGs on Failover Cluster:
   If 2 clusters are queried, 1 fails, and 1 VRG is found on the failover
   cluster, check the action:
   - If the actions don't match, stop until corrected by the user.
   - If they match, also stop but allow failover if the VRG in-hand is a
     secondary. Otherwise, continue.

5. Handling VRGs on Destination Cluster:
   If 2 clusters are queried successfully and 1 or more VRGs are found, and
   one of the VRGs is on the destination cluster, perform the following
   checks:
   - Continue with the action only if the DRPC and the found VRG action
     match.
   - Stop until someone investigates if there is a mismatch, but allow
     failover to take place (set PeerReady).

6. Otherwise, default to allowing Failover:
   If none of the above conditions apply, allow failover (set PeerReady) but
   stop until someone makes the necessary change.
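The six rules above boil down to a small decision table. The sketch below is
purely illustrative: the package, function, parameter, and constant names
(decideProgress, failedIsDestination, and so on) are hypothetical
simplifications and are not part of this patch; the authoritative logic is
the determineDRPCState function introduced in
controllers/drplacementcontrol_controller.go.

// Editor's sketch (hypothetical names; see determineDRPCState in this patch
// for the real implementation).
package drpcstate

// progress mirrors the intent of the patch's Progress type.
type progress int

const (
	progressContinue      progress = iota + 1 // proceed with the deployment/action
	progressAllowFailover                     // pause, but set PeerReady so failover can run
	progressStop                              // pause until a user intervenes
)

// decideProgress condenses rules 1-6 from the commit message.
//   - queried: clusters successfully queried (0, 1, or 2 of the 2 DR clusters)
//   - vrgCount: VRGs found on the queried clusters
//   - failedIsDestination: the unreachable cluster is the action's destination
//   - s3VRGFound: a last-known primary VRG was recovered from the S3 store
//   - actionsMatch: the DRPC action equals the VRG action (from S3 or cluster)
//   - vrgOnDestination: the found VRG (or the S3 copy's recorded destination)
//     corresponds to the current destination cluster
//   - vrgIsSecondary: the single VRG found is a secondary
func decideProgress(
	queried, vrgCount int,
	failedIsDestination, s3VRGFound, actionsMatch, vrgOnDestination, vrgIsSecondary bool,
) progress {
	switch {
	case queried == 0: // rule 1: both queries failed
		return progressStop
	case queried == 2 && vrgCount == 0: // rule 2: clean slate, initial deployment
		return progressContinue
	case queried == 1 && vrgCount == 0: // rule 3: fall back to the S3 store
		if !s3VRGFound {
			if failedIsDestination {
				return progressStop
			}

			return progressContinue // likely an initial deployment
		}

		if actionsMatch && vrgOnDestination && !failedIsDestination {
			return progressContinue
		}

		return progressAllowFailover
	case queried == 1 && vrgCount == 1: // rule 4: one VRG found, one cluster down
		if !actionsMatch {
			return progressStop
		}

		if !vrgOnDestination && vrgIsSecondary {
			return progressAllowFailover
		}

		return progressContinue
	case queried == 2 && vrgCount >= 1: // rule 5: VRG(s) visible, both clusters up
		if actionsMatch {
			return progressContinue
		}

		return progressAllowFailover
	default: // rule 6: allow failover, otherwise wait for the user
		return progressAllowFailover
	}
}

In the patch, these three outcomes map to the new Progress values Continue,
AllowFailover, and Stop: AllowFailover and Stop both pause the DRPC with
ProgressionActionPaused, and AllowFailover additionally sets PeerReady.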
Signed-off-by: Benamar Mekhissi --- api/v1alpha1/drplacementcontrol_types.go | 2 + controllers/drcluster_mmode.go | 2 +- controllers/drplacementcontrol.go | 210 +++--------- controllers/drplacementcontrol_controller.go | 324 ++++++++++++++++-- .../drplacementcontrol_controller_test.go | 231 +++++-------- controllers/drplacementcontrolvolsync.go | 2 +- 6 files changed, 420 insertions(+), 351 deletions(-) diff --git a/api/v1alpha1/drplacementcontrol_types.go b/api/v1alpha1/drplacementcontrol_types.go index 911ebdd6e5..3692908580 100644 --- a/api/v1alpha1/drplacementcontrol_types.go +++ b/api/v1alpha1/drplacementcontrol_types.go @@ -69,6 +69,7 @@ const ( ReasonCleaning = "Cleaning" ReasonSuccess = "Success" ReasonNotStarted = "NotStarted" + ReasonPaused = "Paused" ) type ProgressionStatus string @@ -93,6 +94,7 @@ const ( ProgressionEnsuringVolSyncSetup = ProgressionStatus("EnsuringVolSyncSetup") ProgressionSettingupVolsyncDest = ProgressionStatus("SettingUpVolSyncDest") ProgressionDeleting = ProgressionStatus("Deleting") + ProgressionActionPaused = ProgressionStatus("Paused") ) // DRPlacementControlSpec defines the desired state of DRPlacementControl diff --git a/controllers/drcluster_mmode.go b/controllers/drcluster_mmode.go index b151edfd39..83bea1fb88 100644 --- a/controllers/drcluster_mmode.go +++ b/controllers/drcluster_mmode.go @@ -115,7 +115,7 @@ func (u *drclusterInstance) getVRGs(drpcCollection DRPCAndPolicy) (map[string]*r return nil, err } - vrgs, failedToQueryCluster, err := getVRGsFromManagedClusters( + vrgs, _, failedToQueryCluster, err := getVRGsFromManagedClusters( u.reconciler.MCVGetter, drpcCollection.drpc, drClusters, diff --git a/controllers/drplacementcontrol.go b/controllers/drplacementcontrol.go index 7f0efea911..5253c29a02 100644 --- a/controllers/drplacementcontrol.go +++ b/controllers/drplacementcontrol.go @@ -18,7 +18,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/sets" "sigs.k8s.io/yaml" rmn "github.com/ramendr/ramen/api/v1alpha1" @@ -323,13 +322,6 @@ func (d *DRPCInstance) startDeploying(homeCluster, homeClusterNamespace string) func (d *DRPCInstance) RunFailover() (bool, error) { d.log.Info("Entering RunFailover", "state", d.getLastDRState()) - if d.isPlacementNeedsFixing() { - err := d.fixupPlacementForFailover() - if err != nil { - d.log.Info("Couldn't fix up placement for Failover") - } - } - const done = true // We are done if empty @@ -662,7 +654,7 @@ func GetLastKnownVRGPrimaryFromS3( vrg := &rmn.VolumeReplicationGroup{} if err := vrgObjectDownload(objectStorer, sourcePathNamePrefix, vrg); err != nil { - log.Error(err, "Kube objects capture-to-recover-from identifier get error") + log.Info(fmt.Sprintf("Failed to get VRG from s3 store - s3ProfileName %s. 
Err %v", s3ProfileName, err)) continue } @@ -773,13 +765,6 @@ func checkActivationForStorageIdentifier( func (d *DRPCInstance) RunRelocate() (bool, error) { d.log.Info("Entering RunRelocate", "state", d.getLastDRState(), "progression", d.getProgression()) - if d.isPlacementNeedsFixing() { - err := d.fixupPlacementForRelocate() - if err != nil { - d.log.Info("Couldn't fix up placement for Relocate") - } - } - const done = true preferredCluster := d.instance.Spec.PreferredCluster @@ -816,7 +801,7 @@ func (d *DRPCInstance) RunRelocate() (bool, error) { } if d.getLastDRState() != rmn.Relocating && !d.validatePeerReady() { - return !done, fmt.Errorf("clean up on secondaries pending (%+v)", d.instance) + return !done, fmt.Errorf("clean up secondaries is pending (%+v)", d.instance.Status.Conditions) } if curHomeCluster != "" && curHomeCluster != preferredCluster { @@ -841,6 +826,11 @@ func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) { return !done, err } + err = d.ensurePlacement(srcCluster) + if err != nil { + return !done, err + } + d.setProgression(rmn.ProgressionCleaningUp) // Cleanup and setup VolSync if enabled @@ -1009,12 +999,12 @@ func (d *DRPCInstance) areMultipleVRGsPrimary() bool { func (d *DRPCInstance) validatePeerReady() bool { condition := findCondition(d.instance.Status.Conditions, rmn.ConditionPeerReady) - d.log.Info(fmt.Sprintf("validatePeerReady -- Condition %v", condition)) - if condition == nil || condition.Status == metav1.ConditionTrue { return true } + d.log.Info("validatePeerReady", "Condition", condition) + return false } @@ -1281,15 +1271,13 @@ func (d *DRPCInstance) vrgExistsAndPrimary(targetCluster string) bool { return false } - clusterDecision := d.reconciler.getClusterDecision(d.userPlacement) - if clusterDecision.ClusterName != "" && - targetCluster == clusterDecision.ClusterName { - d.log.Info(fmt.Sprintf("Already %q to cluster %s", d.getLastDRState(), targetCluster)) - - return true + if !vrg.GetDeletionTimestamp().IsZero() { + return false } - return false + d.log.Info(fmt.Sprintf("Already %q to cluster %s", d.getLastDRState(), targetCluster)) + + return true } func (d *DRPCInstance) mwExistsAndPlacementUpdated(targetCluster string) (bool, error) { @@ -1435,7 +1423,7 @@ func (d *DRPCInstance) createVRGManifestWork(homeCluster string, repState rmn.Re d.log.Info("Creating VRG ManifestWork", "Last State:", d.getLastDRState(), "cluster", homeCluster) - vrg := d.generateVRG(repState) + vrg := d.generateVRG(homeCluster, repState) vrg.Spec.VolSync.Disabled = d.volSyncDisabled annotations := make(map[string]string) @@ -1468,6 +1456,18 @@ func (d *DRPCInstance) ensureVRGManifestWork(homeCluster string) error { return d.createVRGManifestWork(homeCluster, cachedVrg.Spec.ReplicationState) } +func (d *DRPCInstance) ensurePlacement(homeCluster string) error { + clusterDecision := d.reconciler.getClusterDecision(d.userPlacement) + if clusterDecision.ClusterName == "" || + homeCluster != clusterDecision.ClusterName { + d.updatePreferredDecision() + + return d.updateUserPlacementRule(homeCluster, homeCluster) + } + + return nil +} + func vrgAction(drpcAction rmn.DRAction) rmn.VRGAction { switch drpcAction { case rmn.ActionFailover: @@ -1488,14 +1488,20 @@ func (d *DRPCInstance) setVRGAction(vrg *rmn.VolumeReplicationGroup) { vrg.Spec.Action = action } -func (d *DRPCInstance) generateVRG(repState rmn.ReplicationState) rmn.VolumeReplicationGroup { +func (d *DRPCInstance) generateVRG(dstCluster string, repState rmn.ReplicationState) 
rmn.VolumeReplicationGroup { vrg := rmn.VolumeReplicationGroup{ - TypeMeta: metav1.TypeMeta{Kind: "VolumeReplicationGroup", APIVersion: "ramendr.openshift.io/v1alpha1"}, - ObjectMeta: metav1.ObjectMeta{Name: d.instance.Name, Namespace: d.vrgNamespace}, + TypeMeta: metav1.TypeMeta{Kind: "VolumeReplicationGroup", APIVersion: "ramendr.openshift.io/v1alpha1"}, + ObjectMeta: metav1.ObjectMeta{ + Name: d.instance.Name, + Namespace: d.vrgNamespace, + Annotations: map[string]string{ + DestinationClusterAnnotationKey: dstCluster, + }, + }, Spec: rmn.VolumeReplicationGroupSpec{ PVCSelector: d.instance.Spec.PVCSelector, ReplicationState: repState, - S3Profiles: d.availableS3Profiles(), + S3Profiles: AvailableS3Profiles(d.drClusters), KubeObjectProtection: d.instance.Spec.KubeObjectProtection, }, } @@ -1507,21 +1513,6 @@ func (d *DRPCInstance) generateVRG(repState rmn.ReplicationState) rmn.VolumeRepl return vrg } -func (d *DRPCInstance) availableS3Profiles() []string { - profiles := sets.New[string]() - - for i := range d.drClusters { - drCluster := &d.drClusters[i] - if drClusterIsDeleted(drCluster) { - continue - } - - profiles.Insert(drCluster.Spec.S3ProfileName) - } - - return sets.List(profiles) -} - func (d *DRPCInstance) generateVRGSpecAsync() *rmn.VRGAsyncSpec { if dRPolicySupportsRegional(d.drPolicy, d.drClusters) { return &rmn.VRGAsyncSpec{ @@ -1651,10 +1642,9 @@ func (d *DRPCInstance) ensureClusterDataRestored(homeCluster string) (*rmn.Volum annotations[DRPCNameAnnotation] = d.instance.Name annotations[DRPCNamespaceAnnotation] = d.instance.Namespace - vrg, err := d.reconciler.MCVGetter.GetVRGFromManagedCluster(d.instance.Name, - d.vrgNamespace, homeCluster, annotations) - if err != nil { - return nil, false, fmt.Errorf("failed to get VRG %s from cluster %s (err: %w)", d.instance.Name, homeCluster, err) + vrg := d.vrgs[homeCluster] + if vrg == nil { + return nil, false, fmt.Errorf("failed to get VRG %s from cluster %s", d.instance.Name, homeCluster) } // ClusterDataReady condition tells us whether the PVs have been applied on the @@ -2315,126 +2305,6 @@ func (d *DRPCInstance) setConditionOnInitialDeploymentCompletion() { metav1.ConditionTrue, rmn.ReasonSuccess, "Ready") } -func (d *DRPCInstance) isPlacementNeedsFixing() bool { - // Needs fixing if and only if the DRPC Status is empty, the Placement decision is empty, and - // the we have VRG(s) in the managed clusters - clusterDecision := d.reconciler.getClusterDecision(d.userPlacement) - d.log.Info(fmt.Sprintf("Check placement if needs fixing: PrD %v, PlD %v, VRGs %d", - d.instance.Status.PreferredDecision, clusterDecision, len(d.vrgs))) - - if reflect.DeepEqual(d.instance.Status.PreferredDecision, plrv1.PlacementDecision{}) && - (clusterDecision == nil || clusterDecision.ClusterName == "") && len(d.vrgs) > 0 { - return true - } - - return false -} - -func (d *DRPCInstance) selectCurrentPrimaries() []string { - var primaries []string - - for clusterName, vrg := range d.vrgs { - if isVRGPrimary(vrg) { - primaries = append(primaries, clusterName) - } - } - - return primaries -} - -func (d *DRPCInstance) selectPrimaryForFailover(primaries []string) string { - for _, clusterName := range primaries { - if clusterName == d.instance.Spec.FailoverCluster { - return clusterName - } - } - - return "" -} - -func (d *DRPCInstance) fixupPlacementForFailover() error { - d.log.Info("Fixing PlacementRule for failover...") - - var primary string - - var primaries []string - - var secondaries []string - - if d.areMultipleVRGsPrimary() { - primaries := 
d.selectCurrentPrimaries() - primary = d.selectPrimaryForFailover(primaries) - } else { - primary, secondaries = d.selectCurrentPrimaryAndSecondaries() - } - - // IFF we have a primary cluster, and it points to the failoverCluster, then rebuild the - // drpc status with it. - if primary != "" && primary == d.instance.Spec.FailoverCluster { - err := d.updateUserPlacementRule(primary, "") - if err != nil { - return err - } - - // Update our 'well known' preferred placement - d.updatePreferredDecision() - - peerReadyConditionStatus := metav1.ConditionTrue - peerReadyMsg := "Ready" - // IFF more than one primary then the failover hasn't entered the cleanup phase. - // IFF we have a secondary, then the failover hasn't completed the cleanup phase. - // We need to start where it was left off (best guess). - if len(primaries) > 1 || len(secondaries) > 0 { - d.instance.Status.Phase = rmn.FailingOver - peerReadyConditionStatus = metav1.ConditionFalse - peerReadyMsg = "NotReady" - } - - addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionPeerReady, d.instance.Generation, - peerReadyConditionStatus, rmn.ReasonSuccess, peerReadyMsg) - - addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation, - metav1.ConditionTrue, string(d.instance.Status.Phase), "Available") - - return nil - } - - return fmt.Errorf("detected a failover, but it can't rebuild the state") -} - -func (d *DRPCInstance) fixupPlacementForRelocate() error { - if d.areMultipleVRGsPrimary() { - return fmt.Errorf("unconstructable state. Can't have multiple primaries on 'Relocate'") - } - - primary, secondaries := d.selectCurrentPrimaryAndSecondaries() - d.log.Info(fmt.Sprintf("Fixing PlacementRule for relocation. Primary (%s), Secondaries (%v)", - primary, secondaries)) - - // IFF we have a primary cluster, then update the PlacementRule and reset PeerReady to false - // Setting the PeerReady condition status to false allows peer cleanup if necessary - if primary != "" { - err := d.updateUserPlacementRule(primary, "") - if err != nil { - return err - } - - // Assume that it is not clean - addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionPeerReady, d.instance.Generation, - metav1.ConditionFalse, rmn.ReasonNotStarted, "NotReady") - } else if len(secondaries) > 1 { - // Use case 3: After Hub Recovery, the DRPC Action found to be 'Relocate', and ALL VRGs are secondary - addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionPeerReady, d.instance.Generation, - metav1.ConditionTrue, rmn.ReasonSuccess, - fmt.Sprintf("Fixed for relocation to %q", d.instance.Spec.PreferredCluster)) - } - - // Update our 'well known' preferred placement - d.updatePreferredDecision() - - return nil -} - func (d *DRPCInstance) setStatusInitiating() { if !(d.instance.Status.Phase == "" || d.instance.Status.Phase == rmn.Deployed || diff --git a/controllers/drplacementcontrol_controller.go b/controllers/drplacementcontrol_controller.go index 4cd05ca4eb..bd6644f0f7 100644 --- a/controllers/drplacementcontrol_controller.go +++ b/controllers/drplacementcontrol_controller.go @@ -20,6 +20,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -57,6 +58,8 @@ const ( // Maximum retries to create PlacementDecisionName with an increasing index in case of conflicts // 
with existing PlacementDecision resources MaxPlacementDecisionConflictCount = 5 + + DestinationClusterAnnotationKey = "drplacementcontrol.ramendr.openshift.io/destination-cluster" ) var InitialWaitTimeForDRPCPlacementRule = errorswrapper.New("Waiting for DRPC Placement to produces placement decision") @@ -587,7 +590,7 @@ func (r *DRPlacementControlReconciler) SetupWithManager(mgr ctrl.Manager) error // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.7.0/pkg/reconcile // -//nolint:cyclop,funlen,gocognit +//nolint:funlen,gocognit,gocyclo,cyclop func (r *DRPlacementControlReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := r.Log.WithValues("DRPC", req.NamespacedName, "rid", uuid.New()) @@ -651,6 +654,16 @@ func (r *DRPlacementControlReconciler) Reconcile(ctx context.Context, req ctrl.R return ctrl.Result{Requeue: true}, nil } + // Rebuild DRPC state if needed + requeue, err := r.ensureDRPCStatusConsistency(ctx, drpc, drPolicy, placementObj, logger) + if err != nil { + return ctrl.Result{}, err + } + + if requeue { + return ctrl.Result{Requeue: true}, r.updateDRPCStatus(ctx, drpc, placementObj, logger) + } + d, err := r.createDRPCInstance(ctx, drPolicy, drpc, placementObj, logger) if err != nil && !errorswrapper.Is(err, InitialWaitTimeForDRPCPlacementRule) { err2 := r.updateDRPCStatus(ctx, drpc, placementObj, logger) @@ -775,7 +788,7 @@ func (r *DRPlacementControlReconciler) createDRPCInstance( return nil, err } - vrgs, err := updateVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) + vrgs, _, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) if err != nil { return nil, err } @@ -1070,7 +1083,7 @@ func (r *DRPlacementControlReconciler) finalizeDRPC(ctx context.Context, drpc *r } // Verify VRGs have been deleted - vrgs, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) + vrgs, _, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) if err != nil { return fmt.Errorf("failed to retrieve VRGs. We'll retry later. 
Error (%w)", err) } @@ -1425,30 +1438,13 @@ func (r *DRPlacementControlReconciler) clonePlacementRule(ctx context.Context, return clonedPlRule, nil } -func updateVRGsFromManagedClusters(mcvGetter rmnutil.ManagedClusterViewGetter, drpc *rmn.DRPlacementControl, - drClusters []rmn.DRCluster, vrgNamespace string, log logr.Logger, -) (map[string]*rmn.VolumeReplicationGroup, error) { - vrgs, failedClusterToQuery, err := getVRGsFromManagedClusters(mcvGetter, drpc, drClusters, vrgNamespace, log) - if err != nil { - return nil, err - } - - if len(vrgs) == 0 && failedClusterToQuery != drpc.Spec.FailoverCluster { - condition := findCondition(drpc.Status.Conditions, rmn.ConditionPeerReady) - if condition == nil { - addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, - drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Forcing Available") - addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, - metav1.ConditionTrue, rmn.ReasonSuccess, "Forcing Ready") - } - } - - return vrgs, nil -} - -func getVRGsFromManagedClusters(mcvGetter rmnutil.ManagedClusterViewGetter, drpc *rmn.DRPlacementControl, - drClusters []rmn.DRCluster, vrgNamespace string, log logr.Logger, -) (map[string]*rmn.VolumeReplicationGroup, string, error) { +func getVRGsFromManagedClusters( + mcvGetter rmnutil.ManagedClusterViewGetter, + drpc *rmn.DRPlacementControl, + drClusters []rmn.DRCluster, + vrgNamespace string, + log logr.Logger, +) (map[string]*rmn.VolumeReplicationGroup, int, string, error) { vrgs := map[string]*rmn.VolumeReplicationGroup{} annotations := make(map[string]string) @@ -1456,10 +1452,10 @@ func getVRGsFromManagedClusters(mcvGetter rmnutil.ManagedClusterViewGetter, drpc annotations[DRPCNameAnnotation] = drpc.Name annotations[DRPCNamespaceAnnotation] = drpc.Namespace - var failedClusterToQuery string - var clustersQueriedSuccessfully int + var failedCluster string + for i := range drClusters { drCluster := &drClusters[i] @@ -1473,9 +1469,9 @@ func getVRGsFromManagedClusters(mcvGetter rmnutil.ManagedClusterViewGetter, drpc continue } - failedClusterToQuery = drCluster.Name + failedCluster = drCluster.Name - log.Info(fmt.Sprintf("failed to retrieve VRG from %s. err (%v)", drCluster.Name, err)) + log.Info(fmt.Sprintf("failed to retrieve VRG from %s. err (%v).", drCluster.Name, err)) continue } @@ -1495,14 +1491,14 @@ func getVRGsFromManagedClusters(mcvGetter rmnutil.ManagedClusterViewGetter, drpc // We are done if we successfully queried all drClusters if clustersQueriedSuccessfully == len(drClusters) { - return vrgs, "", nil + return vrgs, clustersQueriedSuccessfully, "", nil } if clustersQueriedSuccessfully == 0 { - return vrgs, "", fmt.Errorf("failed to retrieve VRGs from clusters") + return vrgs, 0, "", fmt.Errorf("failed to retrieve VRGs from clusters") } - return vrgs, failedClusterToQuery, nil + return vrgs, clustersQueriedSuccessfully, failedCluster, nil } func (r *DRPlacementControlReconciler) deleteClonedPlacementRule(ctx context.Context, @@ -1545,6 +1541,10 @@ func (r *DRPlacementControlReconciler) addClusterPeersToPlacementRule( // DRPC status is updated at least once every StatusCheckDelay in order to refresh // the VRG status. 
func (d *DRPCInstance) statusUpdateTimeElapsed() bool { + if d.instance.Status.LastUpdateTime == nil { + return false + } + return d.instance.Status.LastUpdateTime.Add(StatusCheckDelay).Before(time.Now()) } @@ -2051,3 +2051,259 @@ func ensureDRPCConditionsInited(conditions *[]metav1.Condition, observedGenerati Message: message, }) } + +func AvailableS3Profiles(drClusters []rmn.DRCluster) []string { + profiles := sets.New[string]() + + for i := range drClusters { + drCluster := &drClusters[i] + if drClusterIsDeleted(drCluster) { + continue + } + + profiles.Insert(drCluster.Spec.S3ProfileName) + } + + return sets.List(profiles) +} + +type Progress int + +const ( + Continue = 1 + AllowFailover = 2 + Stop = 3 +) + +func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( + ctx context.Context, + drpc *rmn.DRPlacementControl, + drPolicy *rmn.DRPolicy, + placementObj client.Object, + log logr.Logger, +) (bool, error) { + requeue := true + + // This will always be false the first time the DRPC resource is first created OR after hub recovery + if drpc.Status.Phase != "" { + return !requeue, nil + } + + dstCluster := drpc.Spec.PreferredCluster + if drpc.Spec.Action == rmn.ActionFailover { + dstCluster = drpc.Spec.FailoverCluster + } + + progress, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + if err != nil { + return requeue, err + } + + switch progress { + case Continue: + return !requeue, nil + case AllowFailover: + updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) + addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, + drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") + addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, + metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") + + return requeue, nil + default: + msg := "Operation Paused - User Intervention Required." + + log.Info(fmt.Sprintf("err:%v - msg:%s", err, msg)) + updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) + addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, + drpc.Generation, metav1.ConditionFalse, rmn.ReasonPaused, msg) + addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, + metav1.ConditionFalse, rmn.ReasonPaused, msg) + + return requeue, nil + } +} + +// determineDRPCState runs the following algorithm +// 1. Stop Condition for Both Failed Queries: +// If attempts to query 2 clusters result in failure for both, the process is halted. + +// 2. Initial Deployment without VRGs: +// If 2 clusters are successfully queried, and no VRGs are found, proceed with the +// initial deployment. + +// 3. Handling Failures with S3 Store Check: +// - If 2 clusters are queried, 1 fails, and 0 VRGs are found, perform the following checks: +// - If the VRG is found in the S3 store, ensure that the DRPC action matches the VRG action. +// If not, stop until the action is corrected, allowing failover if necessary (set PeerReady). +// - If the VRG is not found in the S3 store and the failed cluster is not the destination +// cluster, continue with the initial deployment. + +// 4. Verification and Failover for VRGs on Failover Cluster: +// If 2 clusters are queried, 1 fails, and 1 VRG is found on the failover cluster, check +// the action: +// - If the actions don't match, stop until corrected by the user. +// - If they match, also stop but allow failover if the VRG in-hand is a secondary. +// Otherwise, continue. + +// 5. 
Handling VRGs on Destination Cluster: +// If 2 clusters are queried successfully and 1 or more VRGs are found, and one of the +// VRGs is on the destination cluster, perform the following checks: +// - Continue with the action only if the DRPC and the found VRG action match. +// - Stop until someone investigates if there is a mismatch, but allow failover to +// take place (set PeerReady). + +// 6. Otherwise, default to allowing Failover: +// If none of the above conditions apply, allow failover (set PeerReady) but stop until +// someone makes the necessary change. +// +//nolint:funlen,nestif,gocognit,gocyclo,cyclop +func (r *DRPlacementControlReconciler) determineDRPCState( + ctx context.Context, + drpc *rmn.DRPlacementControl, + drPolicy *rmn.DRPolicy, + placementObj client.Object, + dstCluster string, + log logr.Logger, +) (Progress, error) { + vrgNamespace, err := selectVRGNamespace(r.Client, log, drpc, placementObj) + if err != nil { + log.Info("Failed to select VRG namespace") + + return Stop, err + } + + drClusters, err := getDRClusters(ctx, r.Client, drPolicy) + if err != nil { + return Stop, err + } + + vrgs, successfullyQueriedClusterCount, failedCluster, err := getVRGsFromManagedClusters( + r.MCVGetter, drpc, drClusters, vrgNamespace, log) + if err != nil { + log.Info("Failed to get a list of VRGs") + + return Stop, err + } + + // IF 2 clusters queried, and both queries failed, then STOP + if successfullyQueriedClusterCount == 0 { + log.Info("Number of clusters queried is 0. Stop...") + + return Stop, nil + } + + // IF 2 clusters queried successfully and no VRGs, then continue with initial deployment + if successfullyQueriedClusterCount == 2 && len(vrgs) == 0 { + log.Info("Queried 2 clusters successfully") + + return Continue, nil + } + + // IF queried 2 clusters queried, 1 failed and 0 VRG found, then check s3 store. + // IF the VRG found in the s3 store, ensure that the DRPC action and the VRG action match. IF not, stop until + // the action is corrected, but allow failover to take place if needed (set PeerReady) + // If the VRG is not found in the s3 store and the failedCluster is not the destination cluster, then continue + // with initial deploy + if successfullyQueriedClusterCount == 1 && len(vrgs) == 0 { + vrg := GetLastKnownVRGPrimaryFromS3(ctx, r.APIReader, + AvailableS3Profiles(drClusters), drpc.GetName(), vrgNamespace, r.ObjStoreGetter, log) + if vrg == nil { + // IF the failed cluster is not the dest cluster, then this could be an initial deploy + if failedCluster != dstCluster { + return Continue, nil + } + + log.Info("Unable to query all clusters and failed to get VRG from s3 store") + + return Stop, nil + } + + log.Info("VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) + + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { + log.Info(fmt.Sprintf("Two different actions - drpc action is '%s'/vrg action from s3 is '%s'", + drpc.Spec.Action, vrg.Spec.Action)) + + return AllowFailover, nil + } + + if dstCluster == vrg.GetAnnotations()[DestinationClusterAnnotationKey] && + dstCluster != failedCluster { + log.Info(fmt.Sprintf("VRG from s3. Same dstCluster %s/%s. Proceeding...", + dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + + return Continue, nil + } + + log.Info(fmt.Sprintf("VRG from s3. DRPCAction/vrgAction/DRPCDstClstr/vrgDstClstr %s/%s/%s/%s. 
Allow Failover...", + drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + + return AllowFailover, nil + } + + // IF 2 clusters queried, 1 failed and 1 VRG found on the failover cluster, then check the action, if they don't + // match, stop until corrected by the user. If they do match, then also stop but allow failover if the VRG in-hand + // is a secondary. Othewise, continue... + if successfullyQueriedClusterCount == 1 && len(vrgs) == 1 { + var clusterName string + + var vrg *rmn.VolumeReplicationGroup + for k, v := range vrgs { + clusterName, vrg = k, v + + break + } + + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { + log.Info(fmt.Sprintf("Stop! Two different actions - drpc action is '%s'/vrg action is '%s'", + drpc.Spec.Action, vrg.Spec.Action)) + + return Stop, nil + } + + if dstCluster != clusterName && vrg.Spec.ReplicationState == rmn.Secondary { + log.Info(fmt.Sprintf("Same Action and dstCluster and ReplicationState %s/%s/%s", + drpc.Spec.Action, dstCluster, vrg.Spec.ReplicationState)) + + log.Info("Failover is allowed - Primary is assumed in the failed cluster") + + return AllowFailover, nil + } + + log.Info("Allow to continue") + + return Continue, nil + } + + // Finally, IF 2 clusters queried successfully and 1 or more VRGs found, and if one of the VRGs is on the dstCluster, + // then continue with action if and only if DRPC and the found VRG action match. otherwise, stop until someone + // investigates but allow failover to take place (set PeerReady) + if successfullyQueriedClusterCount == 2 && len(vrgs) >= 1 { + var clusterName string + + var vrg *rmn.VolumeReplicationGroup + + for k, v := range vrgs { + clusterName, vrg = k, v + + break + } + + if drpc.Spec.Action == rmn.DRAction(vrg.Spec.Action) { + log.Info(fmt.Sprintf("Same Action %s", drpc.Spec.Action)) + + return Continue, nil + } + + log.Info("Failover is allowed", "vrgs count", len(vrgs), "drpc action", + drpc.Spec.Action, "vrg action", vrg.Spec.Action, "dstCluster/clusterName", dstCluster+"/"+clusterName) + + return AllowFailover, nil + } + + // IF none of the above, then allow failover (set PeerReady), but stop until someone makes the change + log.Info("Failover is allowed, but user intervention is required") + + return AllowFailover, nil +} diff --git a/controllers/drplacementcontrol_controller_test.go b/controllers/drplacementcontrol_controller_test.go index 7d16cc4332..023d570256 100644 --- a/controllers/drplacementcontrol_controller_test.go +++ b/controllers/drplacementcontrol_controller_test.go @@ -60,6 +60,8 @@ const ( ) var ( + NumberOfVrgsToReturnWhenRebuildingState = 0 + UseApplicationSet = false west1Cluster = &spokeClusterV1.ManagedCluster{ @@ -432,58 +434,63 @@ func (f FakeMCVGetter) GetVRGFromManagedCluster(resourceName, resourceNamespace, return nil, errors.NewNotFound(schema.GroupResource{}, "requested resource not found in ManagedCluster") case "getVRGsFromManagedClusters": - vrgFromMW, err := getVRGFromManifestWork(managedCluster) - if err != nil { - if errors.IsNotFound(err) { - if getFunctionNameAtIndex(3) == "getVRGs" { // Called only from DRCluster reconciler, at present - return fakeVRGWithMModesProtectedPVC() - } - } + return doGetFakeVRGsFromManagedClusters(managedCluster, vrgStatus) + } + + return nil, fmt.Errorf("unknown caller %s", getFunctionNameAtIndex(2)) +} - return nil, err +func doGetFakeVRGsFromManagedClusters(managedCluster string, vrgStatus rmn.VolumeReplicationGroupStatus, +) (*rmn.VolumeReplicationGroup, error) { + 
vrgFromMW, err := getVRGFromManifestWork(managedCluster) + if err != nil { + if errors.IsNotFound(err) { + if getFunctionNameAtIndex(3) == "getVRGs" { // Called only from DRCluster reconciler, at present + return fakeVRGWithMModesProtectedPVC() + } } - if vrgFromMW != nil { - vrgFromMW.Generation = 1 - vrgFromMW.Status = vrgStatus - vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ - Type: controllers.VRGConditionTypeClusterDataReady, - Reason: controllers.VRGConditionReasonClusterDataRestored, - Status: metav1.ConditionTrue, - Message: "Cluster Data Ready", - LastTransitionTime: metav1.Now(), - ObservedGeneration: vrgFromMW.Generation, - }) - vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ - Type: controllers.VRGConditionTypeClusterDataProtected, - Reason: controllers.VRGConditionReasonClusterDataRestored, - Status: metav1.ConditionTrue, - Message: "Cluster Data Protected", - LastTransitionTime: metav1.Now(), - ObservedGeneration: vrgFromMW.Generation, - }) - vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ - Type: controllers.VRGConditionTypeDataProtected, - Reason: controllers.VRGConditionReasonDataProtected, - Status: metav1.ConditionTrue, - Message: "Data Protected", - LastTransitionTime: metav1.Now(), - ObservedGeneration: vrgFromMW.Generation, - }) + return nil, err + } - protectedPVC := &rmn.ProtectedPVC{} - protectedPVC.Name = "random name" - protectedPVC.StorageIdentifiers.ReplicationID.ID = MModeReplicationID - protectedPVC.StorageIdentifiers.StorageProvisioner = MModeCSIProvisioner - protectedPVC.StorageIdentifiers.ReplicationID.Modes = []rmn.MMode{rmn.MModeFailover} + if vrgFromMW != nil { + vrgFromMW.Generation = 1 + vrgFromMW.Status = vrgStatus + vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ + Type: controllers.VRGConditionTypeClusterDataReady, + Reason: controllers.VRGConditionReasonClusterDataRestored, + Status: metav1.ConditionTrue, + Message: "Cluster Data Ready", + LastTransitionTime: metav1.Now(), + ObservedGeneration: vrgFromMW.Generation, + }) + vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ + Type: controllers.VRGConditionTypeClusterDataProtected, + Reason: controllers.VRGConditionReasonClusterDataRestored, + Status: metav1.ConditionTrue, + Message: "Cluster Data Protected", + LastTransitionTime: metav1.Now(), + ObservedGeneration: vrgFromMW.Generation, + }) + vrgFromMW.Status.Conditions = append(vrgFromMW.Status.Conditions, metav1.Condition{ + Type: controllers.VRGConditionTypeDataProtected, + Reason: controllers.VRGConditionReasonDataProtected, + Status: metav1.ConditionTrue, + Message: "Data Protected", + LastTransitionTime: metav1.Now(), + ObservedGeneration: vrgFromMW.Generation, + }) - vrgFromMW.Status.ProtectedPVCs = append(vrgFromMW.Status.ProtectedPVCs, *protectedPVC) - } + protectedPVC := &rmn.ProtectedPVC{} + protectedPVC.Name = "random name" + protectedPVC.StorageIdentifiers.ReplicationID.ID = MModeReplicationID + protectedPVC.StorageIdentifiers.StorageProvisioner = MModeCSIProvisioner + protectedPVC.StorageIdentifiers.ReplicationID.Modes = []rmn.MMode{rmn.MModeFailover} - return vrgFromMW, nil + vrgFromMW.Status.ProtectedPVCs = append(vrgFromMW.Status.ProtectedPVCs, *protectedPVC) } - return nil, fmt.Errorf("unknown caller %s", getFunctionNameAtIndex(2)) + return vrgFromMW, nil } func (f FakeMCVGetter) DeleteVRGManagedClusterView( @@ -1304,7 +1311,7 @@ func 
verifyUserPlacementRuleDecisionUnchanged(name, namespace, homeCluster strin var placementObj client.Object - Consistently(func() bool { + Eventually(func() bool { err := k8sClient.Get(context.TODO(), usrPlcementLookupKey, usrPlRule) if errors.IsNotFound(err) { usrPlmnt := &clrapiv1beta1.Placement{} @@ -1425,6 +1432,7 @@ func waitForCompletion(expectedState string) { fmt.Sprintf("failed waiting for state to match. expecting: %s, found %s", expectedState, drstate)) } +//nolint:unused func waitForUpdateDRPCStatus() { Eventually(func() bool { drpc := getLatestDRPC() @@ -1491,6 +1499,7 @@ func runFailoverAction(placementObj client.Object, fromCluster, toCluster string Expect(decision.ClusterName).To(Equal(toCluster)) } +//nolint:all func clearDRActionAfterFailover(userPlacementRule *plrv1.PlacementRule, preferredCluster, failoverCluster string) { drstate = "none" @@ -1831,9 +1840,32 @@ func verifyDRPCOwnedByPlacement(placementObj client.Object, drpc *rmn.DRPlacemen Fail(fmt.Sprintf("DRPC %s not owned by Placement %s", drpc.GetName(), placementObj.GetName())) } +//nolint:all +func checkConditionAllowFailover() { + var drpc *rmn.DRPlacementControl + + var availableCondition metav1.Condition + + Eventually(func() bool { + drpc = getLatestDRPC() + for _, availableCondition = range drpc.Status.Conditions { + if availableCondition.Type != rmn.ConditionPeerReady { + if availableCondition.Status == metav1.ConditionTrue { + return true + } + } + } + + return false + }, timeout, interval).Should(BeTrue(), fmt.Sprintf("Condition '%+v'", availableCondition)) + + Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) + Expect(availableCondition.Message).Should(Equal("Failover allowed")) +} + // +kubebuilder:docs-gen:collapse=Imports // -//nolint:errcheck +//nolint:errcheck,scopelint var _ = Describe("DRPlacementControl Reconciler", func() { Specify("DRClusters", func() { populateDRClusters() @@ -1858,7 +1890,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { By("\n\n*** Failover - 1\n\n") setRestorePVsUncomplete() setDRPCSpecExpectationTo(rmn.ActionFailover, East1ManagedCluster, West1ManagedCluster) - verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, East1ManagedCluster) + verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, West1ManagedCluster) // MWs for VRG, NS, DRCluster, and MMode Eventually(getManifestWorkCount, timeout, interval).WithArguments(West1ManagedCluster).Should(Equal(4)) setRestorePVsComplete() @@ -1884,21 +1916,6 @@ var _ = Describe("DRPlacementControl Reconciler", func() { runRelocateAction(userPlacementRule, West1ManagedCluster, false, false) }) }) - When("DRAction is Relocate during hub recovery", func() { - It("Should reconstructs the DRPC state and points to Primary (East1ManagedCluster)", func() { - // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- - By("\n\n*** Relocate - 2\n\n") - clearFakeUserPlacementRuleStatus() - clearDRPCStatus() - runRelocateAction(userPlacementRule, West1ManagedCluster, false, false) - }) - }) - When("DRAction is cleared after relocation", func() { - It("Should not do anything", func() { - // ----------------------------- Clear DRAction -------------------------------------- - clearDRActionAfterRelocate(userPlacementRule, East1ManagedCluster, West1ManagedCluster) - }) - }) When("DRAction is changed to Failover after relocation", func() { It("Should failover again to Secondary (West1ManagedCluster)", func() { // 
----------------------------- FAILOVER TO SECONDARY -------------------------------------- @@ -1906,30 +1923,6 @@ var _ = Describe("DRPlacementControl Reconciler", func() { runFailoverAction(userPlacementRule, East1ManagedCluster, West1ManagedCluster, false, false) }) }) - When("DRAction is cleared after failover", func() { - It("Should not do anything", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> clearing DRAction") - clearDRActionAfterFailover(userPlacementRule, East1ManagedCluster, West1ManagedCluster) - }) - }) - When("DRAction is cleared but DRPC status is empty", func() { - It("Should do nothing for placement as it is deployed somewhere else", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> Update DRPC status") - clearDRPCStatus() - waitForCompletion(string("")) - drpc = getLatestDRPC() - // At this point expect the DRPC status condition to have 2 types - // {Available and PeerReady} - // Final state didn't change and it is 'FailedOver' even though we tried to run - // initial deployment. But the status remains cleared. - Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - decision := getLatestUserPlacementDecision(userPlacementRule.Name, userPlacementRule.Namespace) - Expect(decision.ClusterName).To(Equal(West1ManagedCluster)) - Expect(drpc.GetAnnotations()[controllers.LastAppDeploymentCluster]).To(Equal(West1ManagedCluster)) - }) - }) When("DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- @@ -2009,7 +2002,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { It("Should not failover to Secondary (West1ManagedCluster) till PV manifest is applied", func() { setRestorePVsUncomplete() setDRPCSpecExpectationTo(rmn.ActionFailover, East1ManagedCluster, West1ManagedCluster) - verifyUserPlacementRuleDecisionUnchanged(placement.Name, placement.Namespace, East1ManagedCluster) + verifyUserPlacementRuleDecisionUnchanged(placement.Name, placement.Namespace, West1ManagedCluster) // MWs for VRG, NS, VRG DRCluster, and MMode Expect(getManifestWorkCount(West1ManagedCluster)).Should(Equal(4)) Expect(len(getPlacementDecision(placement.GetName(), placement.GetNamespace()). 
@@ -2150,14 +2143,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { }) Context("DRPlacementControl Reconciler Sync DR", func() { userPlacementRule := &plrv1.PlacementRule{} - drpc := &rmn.DRPlacementControl{} + // drpc := &rmn.DRPlacementControl{} Specify("DRClusters", func() { populateDRClusters() }) When("An Application is deployed for the first time", func() { It("Should deploy to East1ManagedCluster", func() { By("Initial Deployment") - userPlacementRule, drpc = InitialDeploymentSync(DRPCNamespaceName, UserPlacementRuleName, East1ManagedCluster) + userPlacementRule, _ = InitialDeploymentSync(DRPCNamespaceName, UserPlacementRuleName, East1ManagedCluster) verifyInitialDRPCDeployment(userPlacementRule, East1ManagedCluster) verifyDRPCOwnedByPlacement(userPlacementRule, getLatestDRPC()) }) @@ -2168,7 +2161,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { setRestorePVsUncomplete() fenceCluster(East1ManagedCluster, false) setDRPCSpecExpectationTo(rmn.ActionFailover, East1ManagedCluster, East2ManagedCluster) - verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, East1ManagedCluster) + verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, East2ManagedCluster) // MWs for VRG, VRG DRCluster and the MW for NetworkFence CR to fence off // East1ManagedCluster Expect(getManifestWorkCount(East2ManagedCluster)).Should(Equal(4)) @@ -2200,29 +2193,6 @@ var _ = Describe("DRPlacementControl Reconciler", func() { runFailoverAction(userPlacementRule, East1ManagedCluster, East2ManagedCluster, true, false) }) }) - When("DRAction is cleared after failover", func() { - It("Should not do anything", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> clearing DRAction") - clearDRActionAfterFailover(userPlacementRule, East1ManagedCluster, East2ManagedCluster) - }) - }) - When("DRAction is cleared but DRPC status is empty", func() { - It("Should do nothing for placement as it is deployed somewhere else", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> Update DRPC status") - clearDRPCStatus() - waitForCompletion(string("")) - drpc = getLatestDRPC() - // At this point expect the DRPC status condition to have 2 types - // {Available and PeerReady} - // Final state didn't change and it is 'FailedOver' even though we tried to run - // initial deployment. But the status remains cleared. 
- Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - decision := getLatestUserPlacementDecision(userPlacementRule.Name, userPlacementRule.Namespace) - Expect(decision.ClusterName).To(Equal(East2ManagedCluster)) - }) - }) When("DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY FOR SYNC DR------------------------ @@ -2249,14 +2219,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // manual fencing and manual unfencing userPlacementRule = &plrv1.PlacementRule{} - drpc = &rmn.DRPlacementControl{} + // drpc = &rmn.DRPlacementControl{} Specify("DRClusters", func() { populateDRClusters() }) When("An Application is deployed for the first time", func() { It("Should deploy to East1ManagedCluster", func() { By("Initial Deployment") - userPlacementRule, drpc = InitialDeploymentSync(DRPCNamespaceName, UserPlacementRuleName, East1ManagedCluster) + userPlacementRule, _ = InitialDeploymentSync(DRPCNamespaceName, UserPlacementRuleName, East1ManagedCluster) verifyInitialDRPCDeployment(userPlacementRule, East1ManagedCluster) verifyDRPCOwnedByPlacement(userPlacementRule, getLatestDRPC()) }) @@ -2267,7 +2237,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { setRestorePVsUncomplete() fenceCluster(East1ManagedCluster, true) setDRPCSpecExpectationTo(rmn.ActionFailover, East1ManagedCluster, East2ManagedCluster) - verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, East1ManagedCluster) + verifyUserPlacementRuleDecisionUnchanged(userPlacementRule.Name, userPlacementRule.Namespace, East2ManagedCluster) // MWs for VRG, VRG DRCluster and the MW for NetworkFence CR to fence off // East1ManagedCluster Expect(getManifestWorkCount(East2ManagedCluster)).Should(Equal(4)) @@ -2286,12 +2256,6 @@ var _ = Describe("DRPlacementControl Reconciler", func() { runRelocateAction(userPlacementRule, East2ManagedCluster, true, true) }) }) - When("DRAction is cleared after relocation", func() { - It("Should not do anything", func() { - // ----------------------------- Clear DRAction -------------------------------------- - clearDRActionAfterRelocate(userPlacementRule, East1ManagedCluster, East2ManagedCluster) - }) - }) When("DRAction is changed to Failover after relocation", func() { It("Should failover again to Secondary (East2ManagedCluster)", func() { // ----------------------------- FAILOVER TO SECONDARY FOR SYNC DR-------------------- @@ -2299,29 +2263,6 @@ var _ = Describe("DRPlacementControl Reconciler", func() { runFailoverAction(userPlacementRule, East1ManagedCluster, East2ManagedCluster, true, true) }) }) - When("DRAction is cleared after failover", func() { - It("Should not do anything", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> clearing DRAction") - clearDRActionAfterFailover(userPlacementRule, East1ManagedCluster, East2ManagedCluster) - }) - }) - When("DRAction is cleared but DRPC status is empty", func() { - It("Should do nothing for placement as it is deployed somewhere else", func() { - // ----------------------------- Clear DRAction -------------------------------------- - By("\n\n>>> Update DRPC status") - clearDRPCStatus() - waitForCompletion(string("")) - drpc = getLatestDRPC() - // At this point expect the DRPC status condition to have 2 types - // {Available and PeerReady} - // Final state didn't change and it is 'FailedOver' even though we tried to run - // initial 
deployment. But the status remains cleared. - Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - decision := getLatestUserPlacementDecision(userPlacementRule.Name, userPlacementRule.Namespace) - Expect(decision.ClusterName).To(Equal(East2ManagedCluster)) - }) - }) When("DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY FOR SYNC DR------------------------ diff --git a/controllers/drplacementcontrolvolsync.go b/controllers/drplacementcontrolvolsync.go index 04b284cdb8..691382ddbb 100644 --- a/controllers/drplacementcontrolvolsync.go +++ b/controllers/drplacementcontrolvolsync.go @@ -308,7 +308,7 @@ func (d *DRPCInstance) createVolSyncDestManifestWork(clusterToSkip string) error annotations[DRPCNameAnnotation] = d.instance.Name annotations[DRPCNamespaceAnnotation] = d.instance.Namespace - vrg := d.generateVRG(rmn.Secondary) + vrg := d.generateVRG(dstCluster, rmn.Secondary) if err := d.mwu.CreateOrUpdateVRGManifestWork( d.instance.Name, d.vrgNamespace, dstCluster, vrg, annotations); err != nil {