Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix disable dr when VR failed validation #1570

Merged
merged 7 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 92 additions & 44 deletions internal/controller/vrg_volrep.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ func (v *VRGInstance) preparePVCForVRDeletion(pvc *corev1.PersistentVolumeClaim,
pvc.Spec.VolumeName, pvc.Namespace, pvc.Name, v.instance.Namespace, v.instance.Name, err)
}

log.Info("Deleted ramen annotations from PersistentVolume", "pv", pv.Name)

ownerRemoved := rmnutil.ObjectOwnerUnsetIfSet(pvc, vrg)
// Remove VR finalizer from PVC and the annotation (PVC maybe left behind, so remove the annotation)
finalizerRemoved := controllerutil.RemoveFinalizer(pvc, PvcVRFinalizerProtected)
Expand All @@ -470,8 +472,8 @@ func (v *VRGInstance) preparePVCForVRDeletion(pvc *corev1.PersistentVolumeClaim,
pvc.Namespace, pvc.Name, v.instance.Namespace, v.instance.Name, err)
}

log1.Info("PVC update for VR deletion",
"finalizers", pvc.GetFinalizers(), "labels", pvc.GetLabels(), "annotations", pvc.GetAnnotations())
log1.Info("Deleted ramen annotations, labels, and finallizers from PersistentVolumeClaim",
"annotations", pvc.GetAnnotations(), "labels", pvc.GetLabels(), "finalizers", pvc.GetFinalizers())

return nil
}
Expand Down Expand Up @@ -886,14 +888,17 @@ func (v *VRGInstance) undoPVCFinalizersAndPVRetention(pvc *corev1.PersistentVolu

// reconcileMissingVR determines if VR is missing, and if missing completes other steps required for
// reconciliation during deletion.
// VR can be missing,
// - if no VR was created post initial processing, by when VRG was deleted. In this case
// no PV was also uploaded, as VR is created first before PV is uploaded.
// - if VR was deleted in a prior reconcile, during VRG deletion, but steps post VR deletion were not
// completed, at this point a deleted VR is also not processed further (its generation would have been updated)
// Returns 2 booleans,
// - the first indicating if VR is missing or not, to enable further VR processing if needed
// - the next indicating any required requeue of the request, due to errors in determining VR presence
//
// VR can be missing:
// - if no VR was created post initial processing, by when VRG was deleted. In this case no PV was also
// uploaded, as VR is created first before PV is uploaded.
// - if VR was deleted in a prior reconcile, during VRG deletion, but steps post VR deletion were not
// completed, at this point a deleted VR is also not processed further (its generation would have been
// updated)
//
// Returns 2 booleans:
// - the first indicating if VR is missing or not, to enable further VR processing if needed
// - the next indicating any required requeue of the request, due to errors in determining VR presence
func (v *VRGInstance) reconcileMissingVR(pvc *corev1.PersistentVolumeClaim, log logr.Logger) (bool, bool) {
const (
requeue = true
Expand All @@ -910,7 +915,7 @@ func (v *VRGInstance) reconcileMissingVR(pvc *corev1.PersistentVolumeClaim, log
err := v.reconciler.Get(v.ctx, vrNamespacedName, volRep)
if err == nil {
if rmnutil.ResourceIsDeleted(volRep) {
log.Info("Requeuing due to processing a VR under deletion")
log.Info("Requeuing due to processing a deleted VR")

return !vrMissing, requeue
}
Expand All @@ -924,7 +929,7 @@ func (v *VRGInstance) reconcileMissingVR(pvc *corev1.PersistentVolumeClaim, log
return !vrMissing, requeue
}

log.Info("Preparing PVC as VR is detected as missing or deleted")
log.Info("Unprotecting PVC as VR is missing")

if err := v.preparePVCForVRDeletion(pvc, log); err != nil {
log.Info("Requeuing due to failure in preparing PersistentVolumeClaim for deletion",
Expand Down Expand Up @@ -1407,12 +1412,69 @@ func (v *VRGInstance) checkVRStatus(pvc *corev1.PersistentVolumeClaim, volRep *v
}

// validateVRStatus validates if the VolumeReplication resource has the desired status for the
// current generation and returns true if so, false otherwise
// - When replication state is Primary, only Completed condition is checked.
// - When replication state is Secondary, all 3 conditions for Completed/Degraded/Resyncing is
// checked and ensured healthy.
// current generation, deletion status, and repliaction state.
//
// We handle 3 cases:
// - Primary deleted VRG: If Validated condition exists and false, the VR will never complete and can be
// deleted safely. Otherwise Completed condition is checked.
// - Primary VRG: Completed condition is checked.
// - Secondary VRG: Completed, Degraded and Resyncing conditions are checked and ensured healthy.
func (v *VRGInstance) validateVRStatus(pvc *corev1.PersistentVolumeClaim, volRep *volrep.VolumeReplication,
state ramendrv1alpha1.ReplicationState,
) bool {
// Check validated for primary during VRG deletion.
if state == ramendrv1alpha1.Primary && rmnutil.ResourceIsDeleted(v.instance) {
validated, ok := v.validateVRValidatedStatus(volRep)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a straight call to check the condition, do we need a wrapper function for this?

Suggested change
validated, ok := v.validateVRValidatedStatus(volRep)
conditionMet, errorMsg := isVRConditionMet(volRep, volrep.ConditionValidated, metav1.ConditionTrue)
if !conditionMet && errorMsg == "" {

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we can update the DataReady and Protected conditions since we have more specific error message. I remvoed this update since many tests failed, I think the issue is wrong validation in the test and it is not related to this update, but I focused on solving the main problem. We can improve this in the next step.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, v.validateVRValidatedStatus is a good place to log errors about missing, stale, or unknown condition, similar to validateCompletedStatus and the other helper for secondary vrg.

if !validated && ok {
v.log.Info(fmt.Sprintf("VolumeReplication %s/%s failed validation and can be deleted",
volRep.GetName(), volRep.GetNamespace()))

return true
}
}

// Check completed for both primary and secondary.
if !v.validateVRCompletedStatus(pvc, volRep, state) {
return false
}

// if primary, all checks are completed.
if state == ramendrv1alpha1.Secondary {
return v.validateAdditionalVRStatusForSecondary(pvc, volRep)
}

msg := "PVC in the VolumeReplicationGroup is ready for use"
v.updatePVCDataReadyCondition(pvc.Namespace, pvc.Name, VRGConditionReasonReady, msg)
v.updatePVCDataProtectedCondition(pvc.Namespace, pvc.Name, VRGConditionReasonReady, msg)
v.updatePVCLastSyncTime(pvc.Namespace, pvc.Name, volRep.Status.LastSyncTime)
v.updatePVCLastSyncDuration(pvc.Namespace, pvc.Name, volRep.Status.LastSyncDuration)
v.updatePVCLastSyncBytes(pvc.Namespace, pvc.Name, volRep.Status.LastSyncBytes)
v.log.Info(fmt.Sprintf("VolumeReplication resource %s/%s is ready for use", volRep.GetName(),
volRep.GetNamespace()))

return true
}

// validateVRValidatedStatus validates that VolumeReplicaion resource was validated.
// Return 2 booleans
// - validated: true if the condition is true, otherwise false
// - ok: true if the check was succeesfull, false if the condition is missing, stale, or unknown.
func (v *VRGInstance) validateVRValidatedStatus(
volRep *volrep.VolumeReplication,
) (bool, bool) {
conditionMet, errorMsg := isVRConditionMet(volRep, volrep.ConditionValidated, metav1.ConditionTrue)
if errorMsg != "" {
v.log.Info(fmt.Sprintf("%s (VolRep: %s/%s)", errorMsg, volRep.GetName(), volRep.GetNamespace()))
}

return conditionMet, errorMsg == ""
}

// validateVRCompletedStatus validates if the VolumeReplication resource Completed condition is met and update
// the PVC DataReady and Protected conditions.
// Returns true if the condtion is true, false if the condition is missing, stale, ubnknown, of false.
func (v *VRGInstance) validateVRCompletedStatus(pvc *corev1.PersistentVolumeClaim, volRep *volrep.VolumeReplication,
state ramendrv1alpha1.ReplicationState,
) bool {
var (
stateString string
Expand All @@ -1428,35 +1490,18 @@ func (v *VRGInstance) validateVRStatus(pvc *corev1.PersistentVolumeClaim, volRep
action = "demoted"
}

// it should be completed
conditionMet, msg := isVRConditionMet(volRep, volrep.ConditionCompleted, metav1.ConditionTrue)
if !conditionMet {
defaultMsg := fmt.Sprintf("VolumeReplication resource for pvc not %s to %s", action, stateString)
v.updatePVCDataReadyConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg,
defaultMsg)

v.updatePVCDataProtectedConditionHelper(pvc.Namespace, pvc.Name, VRGConditionReasonError, msg,
defaultMsg)

v.log.Info(fmt.Sprintf("%s (VolRep: %s/%s)", defaultMsg, volRep.GetName(), volRep.GetNamespace()))

return false
}

// if primary, all checks are completed
if state == ramendrv1alpha1.Secondary {
return v.validateAdditionalVRStatusForSecondary(pvc, volRep)
}

msg = "PVC in the VolumeReplicationGroup is ready for use"
v.updatePVCDataReadyCondition(pvc.Namespace, pvc.Name, VRGConditionReasonReady, msg)
v.updatePVCDataProtectedCondition(pvc.Namespace, pvc.Name, VRGConditionReasonReady, msg)
v.updatePVCLastSyncTime(pvc.Namespace, pvc.Name, volRep.Status.LastSyncTime)
v.updatePVCLastSyncDuration(pvc.Namespace, pvc.Name, volRep.Status.LastSyncDuration)
v.updatePVCLastSyncBytes(pvc.Namespace, pvc.Name, volRep.Status.LastSyncBytes)
v.log.Info(fmt.Sprintf("VolumeReplication resource %s/%s is ready for use", volRep.GetName(),
volRep.GetNamespace()))

return true
}

Expand Down Expand Up @@ -1555,34 +1600,35 @@ func (v *VRGInstance) checkResyncCompletionAsSecondary(pvc *corev1.PersistentVol
return true
}

// isVRConditionMet returns true if the condition is met, and an error mesage if we could not get the
// condition value.
func isVRConditionMet(volRep *volrep.VolumeReplication,
conditionType string,
desiredStatus metav1.ConditionStatus,
) (bool, string) {
volRepCondition := findCondition(volRep.Status.Conditions, conditionType)
if volRepCondition == nil {
msg := fmt.Sprintf("Failed to get the %s condition from status of VolumeReplication resource.", conditionType)
errorMsg := fmt.Sprintf("Failed to get the %s condition from status of VolumeReplication resource.",
conditionType)

return false, msg
return false, errorMsg
}

if volRep.GetGeneration() != volRepCondition.ObservedGeneration {
msg := fmt.Sprintf("Stale generation for condition %s from status of VolumeReplication resource.", conditionType)
errorMsg := fmt.Sprintf("Stale generation for condition %s from status of VolumeReplication resource.",
conditionType)

return false, msg
return false, errorMsg
}

if volRepCondition.Status == metav1.ConditionUnknown {
msg := fmt.Sprintf("Unknown status for condition %s from status of VolumeReplication resource.", conditionType)
errorMsg := fmt.Sprintf("Unknown status for condition %s from status of VolumeReplication resource.",
conditionType)

return false, msg
return false, errorMsg
}

if volRepCondition.Status != desiredStatus {
return false, ""
}

return true, ""
return volRepCondition.Status == desiredStatus, ""
}

// Disabling unparam linter as currently every invokation of this
Expand Down Expand Up @@ -1803,6 +1849,8 @@ func (v *VRGInstance) deleteVR(vrNamespacedName types.NamespacedName, log logr.L
return nil
}

v.log.Info("Deleted VolumeReplication resource %s/%s", vrNamespacedName.Namespace, vrNamespacedName.Name)

return v.ensureVRDeletedFromAPIServer(vrNamespacedName, log)
}

Expand Down
1 change: 1 addition & 0 deletions test/envs/kubevirt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ profiles:
disk_size: "100g"
workers:
- addons:
- name: external-snapshotter
- name: rook-operator
- name: rook-cluster
- name: rook-toolbox
Expand Down
1 change: 1 addition & 0 deletions test/envs/regional-dr-kubevirt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ templates:
disk_size: "50g"
workers:
- addons:
- name: external-snapshotter
- name: rook-operator
- name: rook-cluster
- name: rook-toolbox
Expand Down
Loading