Skip to content

Commit

Permalink
Skip patching the PV in finalization for failed operation
Browse files Browse the repository at this point in the history
This commit makes change in restore finalizer controller, to make it
check the status in item operation of a PVC before patch the PV that is
bound to it.  If the operation is not successful it will skip patching
the PV.

Signed-off-by: Daniel Jiang <daniel.jiang@broadcom.com>
  • Loading branch information
reasonerjt committed Jan 8, 2025
1 parent 3eaa739 commit f2426c7
Show file tree
Hide file tree
Showing 5 changed files with 412 additions and 81 deletions.
3 changes: 2 additions & 1 deletion pkg/cmd/server/plugin/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/spf13/cobra"
apiextensions "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"

"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/datamover"

dia "github.com/vmware-tanzu/velero/internal/delete/actions/csi"
Expand Down Expand Up @@ -162,7 +163,7 @@ func NewCommand(f client.Factory) *cobra.Command {
newVolumeSnapshotClassBackupItemAction,
).
RegisterRestoreItemActionV2(
"velero.io/csi-pvc-restorer",
constant.PluginCSIPVCRestoreRIA,
newPvcRestoreItemAction(f),
).
RegisterRestoreItemActionV2(
Expand Down
2 changes: 2 additions & 0 deletions pkg/constant/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@ const (
ControllerSchedule = "schedule"
ControllerServerStatusRequest = "server-status-request"
ControllerRestoreFinalizer = "restore-finalizer"

PluginCSIPVCRestoreRIA = "velero.io/csi-pvc-restorer"
)
68 changes: 61 additions & 7 deletions pkg/controller/restore_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ import (
"sync"
"time"

"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"

storagev1api "k8s.io/api/storage/v1"

"github.com/pkg/errors"
Expand Down Expand Up @@ -155,6 +161,12 @@ func (r *restoreFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Req

restoredPVCList := volume.RestoredPVCFromRestoredResourceList(restoredResourceList)

restoreItemOperations, err := backupStore.GetRestoreItemOperations(restore.Name)
if err != nil {
log.WithError(err).Error("error getting itemOperationList")
return ctrl.Result{}, errors.Wrap(err, "error getting itemOperationList")
}

finalizerCtx := &finalizerContext{
logger: log,
restore: restore,
Expand All @@ -163,6 +175,9 @@ func (r *restoreFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Req
restoredPVCList: restoredPVCList,
multiHookTracker: r.multiHookTracker,
resourceTimeout: r.resourceTimeout,
restoreItemOperationList: restoreItemOperationList{
items: restoreItemOperations,
},
}
warnings, errs := finalizerCtx.execute()

Expand Down Expand Up @@ -239,16 +254,44 @@ func (r *restoreFinalizerReconciler) finishProcessing(restorePhase velerov1api.R
return kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.Client)
}

type restoreItemOperationList struct {
items []*itemoperation.RestoreOperation
}

func (r *restoreItemOperationList) selectByResource(group, resource, ns, name string) []*itemoperation.RestoreOperation {
var res []*itemoperation.RestoreOperation
rid := velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: group,
Resource: resource,
},
Namespace: ns,
Name: name,
}
for _, item := range r.items {
if item != nil && item.Spec.ResourceIdentifier == rid {
res = append(res, item)
}
}
return res
}

// SelectByPVC filters the restore item operation list by PVC namespace and name.
func (r *restoreItemOperationList) SelectByPVC(ns, name string) []*itemoperation.RestoreOperation {
return r.selectByResource("", "persistentvolumeclaims", ns, name)
}

// finalizerContext includes all the dependencies required by finalization tasks and
// a function execute() to orderly implement task logic.
type finalizerContext struct {
logger logrus.FieldLogger
restore *velerov1api.Restore
crClient client.Client
volumeInfo []*volume.BackupVolumeInfo
restoredPVCList map[string]struct{}
multiHookTracker *hook.MultiHookTracker
resourceTimeout time.Duration
logger logrus.FieldLogger
restore *velerov1api.Restore
crClient client.Client
volumeInfo []*volume.BackupVolumeInfo
restoredPVCList map[string]struct{}
restoreItemOperationList restoreItemOperationList
multiHookTracker *hook.MultiHookTracker
resourceTimeout time.Duration
}

func (ctx *finalizerContext) execute() (results.Result, results.Result) { //nolint:unparam //temporarily ignore the lint report: result 0 is always nil (unparam)
Expand Down Expand Up @@ -310,6 +353,17 @@ func (ctx *finalizerContext) patchDynamicPVWithVolumeInfo() (errs results.Result
return false, err
}

// Check whether the async operation to populate the PVC is successful. If it's not, will skip patching the PV, instead of waiting.
operations := ctx.restoreItemOperationList.SelectByPVC(pvc.Namespace, pvc.Name)
for _, op := range operations {
if op.Spec.RestoreItemAction == constant.PluginCSIPVCRestoreRIA &&
op.Status.Phase != itemoperation.OperationPhaseCompleted {
log.Warnf("skipping PV patch, because the operation to restore the PVC is not completed, "+
"operation: %s, phase: %s", op.Spec.OperationID, op.Status.Phase)
return true, nil
}
}

// We are handling a common but specific scenario where a PVC is in a pending state and uses a storage class with
// VolumeBindingMode set to WaitForFirstConsumer. In this case, the PV patch step is skipped to avoid
// failures due to the PVC not being bound, which could cause a timeout and result in a failed restore.
Expand Down
115 changes: 115 additions & 0 deletions pkg/controller/restore_finalizer_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ import (
"testing"
"time"

"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"

"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
Expand Down Expand Up @@ -145,6 +150,7 @@ func TestRestoreFinalizerReconcile(t *testing.T) {
if test.restore != nil && test.restore.Namespace == velerov1api.DefaultNamespace {
require.NoError(t, r.Client.Create(context.Background(), test.restore))
backupStore.On("GetRestoredResourceList", test.restore.Name).Return(map[string][]string{}, nil)
backupStore.On("GetRestoreItemOperations", test.restore.Name).Return([]*itemoperation.RestoreOperation{}, nil)
}
if test.backup != nil {
assert.NoError(t, r.Client.Create(context.Background(), test.backup))
Expand Down Expand Up @@ -627,3 +633,112 @@ func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) {
})
}
}

func TestRestoreOperationList(t *testing.T) {
var empty []*itemoperation.RestoreOperation
tests := []struct {
name string
items []*itemoperation.RestoreOperation
inputPVCNS string
inputPVCName string
expected []*itemoperation.RestoreOperation
}{
{
name: "no restore operations",
items: []*itemoperation.RestoreOperation{},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: empty,
},
{
name: "one operation with matched info and a nil element",
items: []*itemoperation.RestoreOperation{
nil,
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "persistentvolumeclaims",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: []*itemoperation.RestoreOperation{
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "persistentvolumeclaims",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
},
{
name: "one operation with incorrect resource type",
items: []*itemoperation.RestoreOperation{
{
Spec: itemoperation.RestoreOperationSpec{
RestoreName: "restore-1",
RestoreUID: "uid-1",
RestoreItemAction: "velero.io/csi-pvc-restorer",
OperationID: "dd-abbb048d-7036-4855-bf50-ebba978b59a6.2426dd0e-b863-4222b5b2b",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: schema.GroupResource{
Group: "",
Resource: "configmaps",
},
Namespace: "ns-1",
Name: "pvc-1",
},
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
OperationUnits: "Byte",
Description: "Completed",
},
},
},
inputPVCNS: "ns-1",
inputPVCName: "pvc-1",
expected: empty,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
l := restoreItemOperationList{
items: tt.items,
}
assert.Equal(t, tt.expected, l.SelectByPVC(tt.inputPVCNS, tt.inputPVCName))
})
}
}
Loading

0 comments on commit f2426c7

Please sign in to comment.