From 6ff0aa32e3b0e195969fcaa272d633773ed8ff18 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Fri, 20 Dec 2024 10:28:50 +0800 Subject: [PATCH 1/9] recall existing repo maintenance to history Signed-off-by: Lyndon-Li --- .../backup_repository_controller.go | 85 +++++++++++ pkg/repository/maintenance.go | 61 +++++++- pkg/repository/maintenance_test.go | 141 ++++++++++++++++++ 3 files changed, 286 insertions(+), 1 deletion(-) diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index d41f547796..46aef57559 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -22,6 +22,7 @@ import ( "encoding/json" "fmt" "reflect" + "slices" "time" "github.com/pkg/errors" @@ -40,6 +41,7 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/constant" "github.com/vmware-tanzu/velero/pkg/label" + "github.com/vmware-tanzu/velero/pkg/repository" repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config" repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager" "github.com/vmware-tanzu/velero/pkg/util/kube" @@ -206,12 +208,95 @@ func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) } fallthrough case velerov1api.BackupRepositoryPhaseReady: + if err := r.processUnrecordedMaintenance(ctx, backupRepo, log); err != nil { + return ctrl.Result{}, errors.Wrap(err, "error handling incomplete repo maintenance jobs") + } + return ctrl.Result{}, r.runMaintenanceIfDue(ctx, backupRepo, log) } return ctrl.Result{}, nil } +func (r *BackupRepoReconciler) processUnrecordedMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { + history, err := repository.WaitIncompleteMaintenance(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) + if err != nil { + return errors.Wrapf(err, "error waiting incomplete repo maintenance job for repo %s", req.Name) + } + + consolidated := consolidateHistory(history, req.Status.RecentMaintenance) + if consolidated == nil { + return nil + } + + log.Warn("Updating backup repository because of unrecorded histories") + + return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { + rr.Status.RecentMaintenance = consolidated + }) +} + +func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceStatus) []velerov1api.BackupRepositoryMaintenanceStatus { + if len(coming) == 0 { + return nil + } + + if isIdenticalHistories(coming, cur) { + return nil + } + + truncated := []velerov1api.BackupRepositoryMaintenanceStatus{} + i := len(cur) - 1 + j := len(coming) - 1 + for i >= 0 || j >= 0 { + if len(truncated) == defaultMaintenanceStatusQueueLength { + break + } + + if i >= 0 && j >= 0 { + if isEarlierHistory(cur[i], coming[j]) { + truncated = append(truncated, coming[j]) + j-- + } else { + truncated = append(truncated, cur[i]) + i-- + } + } else if i >= 0 { + truncated = append(truncated, cur[i]) + i-- + } else { + truncated = append(truncated, coming[j]) + j-- + } + } + + slices.Reverse(truncated) + + if isIdenticalHistories(truncated, cur) { + return nil + } + + return truncated +} + +func isIdenticalHistories(a, b []velerov1api.BackupRepositoryMaintenanceStatus) bool { + if len(a) != len(b) { + return false + } + + for i := 0; i < len(a); i++ { + if !a[i].CompleteTimestamp.Equal(b[i].CompleteTimestamp) { + return false + } + } + + return true +} + +func isEarlierHistory(a, b 
velerov1api.BackupRepositoryMaintenanceStatus) bool { + return a.CompleteTimestamp.Before(b.CompleteTimestamp) +} + func (r *BackupRepoReconciler) getIdentiferByBSL(ctx context.Context, req *velerov1api.BackupRepository) (string, error) { loc := &velerov1api.BackupStorageLocation{} diff --git a/pkg/repository/maintenance.go b/pkg/repository/maintenance.go index 5ea63979f0..96e08c1479 100644 --- a/pkg/repository/maintenance.go +++ b/pkg/repository/maintenance.go @@ -87,7 +87,7 @@ func DeleteOldMaintenanceJobs(cli client.Client, repo string, keep int) error { } func WaitForJobComplete(ctx context.Context, client client.Client, job *batchv1.Job) error { - return wait.PollUntilContextCancel(ctx, 1, true, func(ctx context.Context) (bool, error) { + return wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { err := client.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: job.Name}, job) if err != nil && !apierrors.IsNotFound(err) { return false, err @@ -250,3 +250,62 @@ func GetMaintenanceJobConfig( return result, nil } + +// WaitIncompleteMaintenance checks all the incomplete maintenance jobs of the specified repo and wait for them to complete, +// and then return the maintenance jobs in the range of limit +func WaitIncompleteMaintenance(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) { + jobList := &batchv1.JobList{} + err := cli.List(context.TODO(), jobList, &client.ListOptions{ + Namespace: repo.Namespace, + }, + client.MatchingLabels(map[string]string{RepositoryNameLabel: repo.Name}), + ) + + if err != nil { + return nil, errors.Wrapf(err, "error listing maintenance job for repo %s", repo.Name) + } + + if len(jobList.Items) == 0 { + return nil, nil + } + + history := []velerov1api.BackupRepositoryMaintenanceStatus{} + + for _, job := range jobList.Items { + if job.Status.Succeeded == 0 && job.Status.Failed == 0 { + log.Infof("Waiting for maintenance job %s to complete", job.Name) + + if err := WaitForJobComplete(ctx, cli, &job); err != nil { + return nil, errors.Wrapf(err, "error waiting maintenance job[%s] complete", job.Name) + } + } + + result := velerov1api.BackupRepositoryMaintenanceSucceeded + if job.Status.Failed > 0 { + result = velerov1api.BackupRepositoryMaintenanceFailed + } + + message, err := GetMaintenanceResultFromJob(cli, &job) + if err != nil { + return nil, errors.Wrapf(err, "error getting maintenance job[%s] result", job.Name) + } + + history = append(history, velerov1api.BackupRepositoryMaintenanceStatus{ + Result: result, + StartTimestamp: &metav1.Time{Time: job.Status.StartTime.Time}, + CompleteTimestamp: &metav1.Time{Time: job.Status.CompletionTime.Time}, + Message: message, + }) + } + + sort.Slice(history, func(i, j int) bool { + return history[i].CompleteTimestamp.Time.After(history[j].CompleteTimestamp.Time) + }) + + startPos := len(history) - limit + if startPos < 0 { + startPos = 0 + } + + return history[startPos:], nil +} diff --git a/pkg/repository/maintenance_test.go b/pkg/repository/maintenance_test.go index f6344b166a..7917ac65d8 100644 --- a/pkg/repository/maintenance_test.go +++ b/pkg/repository/maintenance_test.go @@ -458,3 +458,144 @@ func TestGetMaintenanceJobConfig(t *testing.T) { }) } } + +// func TestWaitIncompleteMaintenance(t *testing.T) { +// ctx, cancel := context.WithCancel(context.Background()) + +// veleroNamespace := "velero" +// repo := &velerov1api.BackupRepository{ +// 
ObjectMeta: metav1.ObjectMeta{ +// Namespace: veleroNamespace, +// Name: "fake-repo", +// }, +// Spec: velerov1api.BackupRepositorySpec{ +// BackupStorageLocation: "default", +// RepositoryType: "kopia", +// VolumeNamespace: "test", +// }, +// } + +// scheme := runtime.NewScheme() +// batchv1.AddToScheme(scheme) + +// testCases := []struct { +// name string +// ctx context.Context +// kubeClientObj []runtime.Object +// runtimeScheme *runtime.Scheme +// expectedStatus []velerov1api.BackupRepositoryMaintenanceStatus +// expectedError string +// }{ +// { +// name: "list job error", +// expectedError: "error listing maintenance job for repo fake-repo", +// }, +// { +// name: "job not exist", +// runtimeScheme: scheme, +// }, +// { +// name: "no matching job", +// runtimeScheme: scheme, +// kubeClientObj: []runtime.Object{ +// jobOtherLabel, +// }, +// }, +// { +// name: "wait complete error", +// ctx: context.WithTimeout(context.TODO(), time.Second), +// runtimeScheme: scheme, +// kubeClientObj: []runtime.Object{ +// jobIncomplete, +// }, +// expectedError: nil, +// }, +// { +// name: "Find config specific for global", +// repoJobConfig: &v1.ConfigMap{ +// ObjectMeta: metav1.ObjectMeta{ +// Namespace: veleroNamespace, +// Name: repoMaintenanceJobConfig, +// }, +// Data: map[string]string{ +// GlobalKeyForRepoMaintenanceJobCM: "{\"podResources\":{\"cpuRequest\":\"50m\",\"cpuLimit\":\"100m\",\"memoryRequest\":\"50Mi\",\"memoryLimit\":\"100Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"n2\"]}]}}]}", +// }, +// }, +// expectedConfig: &JobConfigs{ +// PodResources: &kube.PodResources{ +// CPURequest: "50m", +// CPULimit: "100m", +// MemoryRequest: "50Mi", +// MemoryLimit: "100Mi", +// }, +// LoadAffinities: []*kube.LoadAffinity{ +// { +// NodeSelector: metav1.LabelSelector{ +// MatchExpressions: []metav1.LabelSelectorRequirement{ +// { +// Key: "cloud.google.com/machine-family", +// Operator: metav1.LabelSelectorOpIn, +// Values: []string{"n2"}, +// }, +// }, +// }, +// }, +// }, +// }, +// expectedError: nil, +// }, +// { +// name: "Specific config supersede global config", +// repoJobConfig: &v1.ConfigMap{ +// ObjectMeta: metav1.ObjectMeta{ +// Namespace: veleroNamespace, +// Name: repoMaintenanceJobConfig, +// }, +// Data: map[string]string{ +// GlobalKeyForRepoMaintenanceJobCM: "{\"podResources\":{\"cpuRequest\":\"50m\",\"cpuLimit\":\"100m\",\"memoryRequest\":\"50Mi\",\"memoryLimit\":\"100Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"n2\"]}]}}]}", +// "test-default-kopia": "{\"podResources\":{\"cpuRequest\":\"100m\",\"cpuLimit\":\"200m\",\"memoryRequest\":\"100Mi\",\"memoryLimit\":\"200Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"e2\"]}]}}]}", +// }, +// }, +// expectedConfig: &JobConfigs{ +// PodResources: &kube.PodResources{ +// CPURequest: "100m", +// CPULimit: "200m", +// MemoryRequest: "100Mi", +// MemoryLimit: "200Mi", +// }, +// LoadAffinities: []*kube.LoadAffinity{ +// { +// NodeSelector: metav1.LabelSelector{ +// MatchExpressions: []metav1.LabelSelectorRequirement{ +// { +// Key: "cloud.google.com/machine-family", +// Operator: metav1.LabelSelectorOpIn, +// Values: []string{"e2"}, +// }, +// }, +// }, +// }, +// }, +// }, +// expectedError: nil, +// }, +// } + +// for _, test := range testCases { 
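
The merge that `consolidateHistory` performs in the controller change above is easiest to see on concrete data: both slices are walked from their tails (newest entry first), the newer candidate wins each comparison, the result is capped at the history queue length, and the collected entries are reversed back into chronological order. Below is a minimal standalone sketch of that two-pointer merge, with ints standing in for maintenance-status timestamps; the queue length of 3 and all values are illustrative, not Velero's actual constants.

```go
package main

import (
	"fmt"
	"slices"
)

// queueLength stands in for defaultMaintenanceStatusQueueLength.
const queueLength = 3

// mergeNewest mirrors the shape of consolidateHistory: walk cur and coming
// from their newest (last) entries, keep whichever candidate is newer, stop
// once queueLength entries are collected, then reverse into oldest-first order.
func mergeNewest(coming, cur []int) []int {
	truncated := []int{}
	i, j := len(cur)-1, len(coming)-1
	for (i >= 0 || j >= 0) && len(truncated) < queueLength {
		switch {
		case i >= 0 && j >= 0 && cur[i] < coming[j]: // cur entry is earlier, take coming
			truncated = append(truncated, coming[j])
			j--
		case i >= 0:
			truncated = append(truncated, cur[i])
			i--
		default:
			truncated = append(truncated, coming[j])
			j--
		}
	}
	slices.Reverse(truncated)
	return truncated
}

func main() {
	cur := []int{10, 20, 30}              // history already recorded on the CR
	coming := []int{20, 40}               // history recalled from job objects
	fmt.Println(mergeNewest(coming, cur)) // [20 30 40]
}
```

The real function additionally short-circuits when the incoming history is empty or identical to the recorded one, so the CR is only patched when something actually changed.
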
+// t.Run(test.name, func(t *testing.T) { +// fakeClientBuilder := fake.NewClientBuilder() +// if test.runtimeScheme != nil { +// fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme) +// } + +// fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() + +// if tc.expectedError != nil { +// require.ErrorContains(t, err, tc.expectedError.Error()) +// } else { +// require.NoError(t, err) +// } +// require.Equal(t, tc.expectedConfig, jobConfig) +// }) +// } +// } From 912b116bdb2c6d4d5fa7ca520a3252dd8b6bc000 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Thu, 2 Jan 2025 17:01:44 +0800 Subject: [PATCH 2/9] always use job's time Signed-off-by: Lyndon-Li --- .../backup_repository_controller.go | 204 +++++---- .../backup_repository_controller_test.go | 9 +- pkg/repository/maintenance.go | 65 +-- pkg/repository/maintenance_test.go | 431 ++++++++++++------ pkg/repository/manager/manager.go | 33 +- pkg/repository/mocks/Manager.go | 20 +- 6 files changed, 477 insertions(+), 285 deletions(-) diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index 46aef57559..535fc41b6b 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -208,7 +208,7 @@ func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) } fallthrough case velerov1api.BackupRepositoryPhaseReady: - if err := r.processUnrecordedMaintenance(ctx, backupRepo, log); err != nil { + if err := r.recallMaintenance(ctx, backupRepo, log); err != nil { return ctrl.Result{}, errors.Wrap(err, "error handling incomplete repo maintenance jobs") } @@ -218,85 +218,6 @@ func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } -func (r *BackupRepoReconciler) processUnrecordedMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { - history, err := repository.WaitIncompleteMaintenance(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) - if err != nil { - return errors.Wrapf(err, "error waiting incomplete repo maintenance job for repo %s", req.Name) - } - - consolidated := consolidateHistory(history, req.Status.RecentMaintenance) - if consolidated == nil { - return nil - } - - log.Warn("Updating backup repository because of unrecorded histories") - - return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { - rr.Status.RecentMaintenance = consolidated - }) -} - -func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceStatus) []velerov1api.BackupRepositoryMaintenanceStatus { - if len(coming) == 0 { - return nil - } - - if isIdenticalHistories(coming, cur) { - return nil - } - - truncated := []velerov1api.BackupRepositoryMaintenanceStatus{} - i := len(cur) - 1 - j := len(coming) - 1 - for i >= 0 || j >= 0 { - if len(truncated) == defaultMaintenanceStatusQueueLength { - break - } - - if i >= 0 && j >= 0 { - if isEarlierHistory(cur[i], coming[j]) { - truncated = append(truncated, coming[j]) - j-- - } else { - truncated = append(truncated, cur[i]) - i-- - } - } else if i >= 0 { - truncated = append(truncated, cur[i]) - i-- - } else { - truncated = append(truncated, coming[j]) - j-- - } - } - - slices.Reverse(truncated) - - if isIdenticalHistories(truncated, cur) { - return nil - } - - return truncated -} - -func isIdenticalHistories(a, b []velerov1api.BackupRepositoryMaintenanceStatus) bool { - if len(a) != len(b) { - return false - } - - for 
i := 0; i < len(a); i++ { - if !a[i].CompleteTimestamp.Equal(b[i].CompleteTimestamp) { - return false - } - } - - return true -} - -func isEarlierHistory(a, b velerov1api.BackupRepositoryMaintenanceStatus) bool { - return a.CompleteTimestamp.Before(b.CompleteTimestamp) -} - func (r *BackupRepoReconciler) getIdentiferByBSL(ctx context.Context, req *velerov1api.BackupRepository) (string, error) { loc := &velerov1api.BackupStorageLocation{} @@ -384,10 +305,109 @@ func ensureRepo(repo *velerov1api.BackupRepository, repoManager repomanager.Mana return repoManager.PrepareRepo(repo) } -func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { - startTime := r.clock.Now() +func (r *BackupRepoReconciler) recallMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { + history, err := repository.WaitIncompleteMaintenance(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) + if err != nil { + return errors.Wrapf(err, "error waiting incomplete repo maintenance job for repo %s", req.Name) + } - if !dueForMaintenance(req, startTime) { + consolidated := consolidateHistory(history, req.Status.RecentMaintenance) + if consolidated == nil { + return nil + } + + lastMaintenanceTime := getLastMaintenanceTimeFromHistory(consolidated) + + log.Warn("Updating backup repository because of unrecorded histories") + + if lastMaintenanceTime.After(req.Status.LastMaintenanceTime.Time) { + log.Warnf("Updating backup repository last maintenance time (%v) from history (%v)", req.Status.LastMaintenanceTime.Time, lastMaintenanceTime.Time) + } + + return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { + if lastMaintenanceTime.After(rr.Status.LastMaintenanceTime.Time) { + rr.Status.LastMaintenanceTime = lastMaintenanceTime + } + + rr.Status.RecentMaintenance = consolidated + }) +} + +func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceStatus) []velerov1api.BackupRepositoryMaintenanceStatus { + if len(coming) == 0 { + return nil + } + + if isIdenticalHistory(coming, cur) { + return nil + } + + truncated := []velerov1api.BackupRepositoryMaintenanceStatus{} + i := len(cur) - 1 + j := len(coming) - 1 + for i >= 0 || j >= 0 { + if len(truncated) == defaultMaintenanceStatusQueueLength { + break + } + + if i >= 0 && j >= 0 { + if isEarlierMaintenanceStatus(cur[i], coming[j]) { + truncated = append(truncated, coming[j]) + j-- + } else { + truncated = append(truncated, cur[i]) + i-- + } + } else if i >= 0 { + truncated = append(truncated, cur[i]) + i-- + } else { + truncated = append(truncated, coming[j]) + j-- + } + } + + slices.Reverse(truncated) + + if isIdenticalHistory(truncated, cur) { + return nil + } + + return truncated +} + +func getLastMaintenanceTimeFromHistory(history []velerov1api.BackupRepositoryMaintenanceStatus) *metav1.Time { + time := history[0].CompleteTimestamp + + for i := range history { + if time.Before(history[i].CompleteTimestamp) { + time = history[i].CompleteTimestamp + } + } + + return time +} + +func isIdenticalHistory(a, b []velerov1api.BackupRepositoryMaintenanceStatus) bool { + if len(a) != len(b) { + return false + } + + for i := 0; i < len(a); i++ { + if !a[i].StartTimestamp.Equal(b[i].StartTimestamp) { + return false + } + } + + return true +} + +func isEarlierMaintenanceStatus(a, b velerov1api.BackupRepositoryMaintenanceStatus) bool { + return a.StartTimestamp.Before(b.StartTimestamp) +} + +func (r 
*BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
+	if !dueForMaintenance(req, r.clock.Now()) {
 		log.Debug("not due for maintenance")
 		return nil
 	}
@@ -398,17 +418,23 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
 	// should not cause the repo to move to `NotReady`.
 	log.Debug("Pruning repo")
 
-	if err := r.repositoryManager.PruneRepo(req); err != nil {
-		log.WithError(err).Warn("error pruning repository")
+	// when PruneRepo fails, the maintenance result is left behind temporarily
+	// If the maintenance still completes later, recallMaintenance recalls the left ones and updates LastMaintenanceTime and history
+	status, err := r.repositoryManager.PruneRepo(req)
+	if err != nil {
+		return errors.Wrapf(err, "error pruning repository")
+	}
+
+	if status.Result == velerov1api.BackupRepositoryMaintenanceFailed {
+		log.Warn("Pruning repository failed")
 		return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
-			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, startTime, r.clock.Now(), err.Error())
+			updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp.Time, status.CompleteTimestamp.Time, status.Message)
 		})
 	}
 
 	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
-		completionTime := r.clock.Now()
-		rr.Status.LastMaintenanceTime = &metav1.Time{Time: completionTime}
-		updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, startTime, completionTime, "")
+		rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time}
+		updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp.Time, status.CompleteTimestamp.Time, status.Message)
 	})
 }
 
diff --git a/pkg/controller/backup_repository_controller_test.go b/pkg/controller/backup_repository_controller_test.go
index 376b17ce84..9b5dd4c4ae 100644
--- a/pkg/controller/backup_repository_controller_test.go
+++ b/pkg/controller/backup_repository_controller_test.go
@@ -37,11 +37,11 @@ import (
 
 const testMaintenanceFrequency = 10 * time.Minute
 
-func mockBackupRepoReconciler(t *testing.T, mockOn string, arg interface{}, ret interface{}) *BackupRepoReconciler {
+func mockBackupRepoReconciler(t *testing.T, mockOn string, arg interface{}, ret ...interface{}) *BackupRepoReconciler {
 	t.Helper()
 	mgr := &repomokes.Manager{}
 	if mockOn != "" {
-		mgr.On(mockOn, arg).Return(ret)
+		mgr.On(mockOn, arg).Return(ret...)
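
The switch to a variadic `ret` in this helper is what lets a single mock hook serve both the old single-value methods and the new two-value `PruneRepo`: testify's `Return` forwards the values positionally to the generated mock. A hedged sketch of how a caller might stub the new signature; the fixture values and test name are illustrative, not part of this series.

```go
package controller

import (
	"testing"
	"time"

	"github.com/stretchr/testify/mock"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	repomokes "github.com/vmware-tanzu/velero/pkg/repository/mocks"
)

func TestPruneRepoStubSketch(t *testing.T) {
	mgr := &repomokes.Manager{}
	start := time.Now()

	// Both values travel through the variadic Return; the generated mock
	// reads them back as ret.Get(0) (the status) and ret.Error(1).
	mgr.On("PruneRepo", mock.Anything).Return(
		velerov1api.BackupRepositoryMaintenanceStatus{
			Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
			StartTimestamp:    &metav1.Time{Time: start},
			CompleteTimestamp: &metav1.Time{Time: start.Add(time.Minute)},
		},
		nil,
	)

	status, err := mgr.PruneRepo(nil)
	if err != nil || status.Result != velerov1api.BackupRepositoryMaintenanceSucceeded {
		t.Fatalf("unexpected stub result: %+v, %v", status, err)
	}
}
```
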
}
 	return NewBackupRepoReconciler(
 		velerov1api.DefaultNamespace,
 		velerotest.NewLogger(),
 		velerotest.NewFakeControllerRuntimeClient(t),
@@ -106,7 +106,10 @@ func TestCheckNotReadyRepo(t *testing.T) {
 
 func TestRunMaintenanceIfDue(t *testing.T) {
 	rr := mockBackupRepositoryCR()
-	reconciler := mockBackupRepoReconciler(t, "PruneRepo", rr, nil)
+	reconciler := mockBackupRepoReconciler(t, "PruneRepo", rr, velerov1api.BackupRepositoryMaintenanceStatus{
+		StartTimestamp:    &metav1.Time{},
+		CompleteTimestamp: &metav1.Time{},
+	}, nil)
 	err := reconciler.Client.Create(context.TODO(), rr)
 	assert.NoError(t, err)
 	lastTm := rr.Status.LastMaintenanceTime
diff --git a/pkg/repository/maintenance.go b/pkg/repository/maintenance.go
index 96e08c1479..5a1ec1f614 100644
--- a/pkg/repository/maintenance.go
+++ b/pkg/repository/maintenance.go
@@ -86,22 +86,26 @@ func DeleteOldMaintenanceJobs(cli client.Client, repo string, keep int) error {
 	return nil
 }
 
-func WaitForJobComplete(ctx context.Context, client client.Client, job *batchv1.Job) error {
-	return wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
-		err := client.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: job.Name}, job)
+// WaitForJobComplete waits for the specified job to complete and returns the latest job object
+func WaitForJobComplete(ctx context.Context, client client.Client, ns string, job string) (*batchv1.Job, error) {
+	updated := &batchv1.Job{}
+	err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
+		err := client.Get(ctx, types.NamespacedName{Namespace: ns, Name: job}, updated)
 		if err != nil && !apierrors.IsNotFound(err) {
 			return false, err
 		}
 
-		if job.Status.Succeeded > 0 {
+		if updated.Status.Succeeded > 0 {
 			return true, nil
 		}
 
-		if job.Status.Failed > 0 {
-			return true, fmt.Errorf("maintenance job %s/%s failed", job.Namespace, job.Name)
+		if updated.Status.Failed > 0 {
+			return true, fmt.Errorf("maintenance job %s/%s failed", ns, job)
 		}
 
 		return false, nil
 	})
+
+	return updated, err
 }
 
 func GetMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, error) {
@@ -269,43 +273,52 @@ func WaitIncompleteMaintenance(ctx context.Context, cli client.Client, repo *vel
 		return nil, nil
 	}
 
+	sort.Slice(jobList.Items, func(i, j int) bool {
+		return jobList.Items[i].CreationTimestamp.Time.Before(jobList.Items[j].CreationTimestamp.Time)
+	})
+
 	history := []velerov1api.BackupRepositoryMaintenanceStatus{}
 
-	for _, job := range jobList.Items {
+	startPos := len(jobList.Items) - limit
+	if startPos < 0 {
+		startPos = 0
+	}
+
+	for i := startPos; i < len(jobList.Items); i++ {
+		job := &jobList.Items[i]
+
 		if job.Status.Succeeded == 0 && job.Status.Failed == 0 {
 			log.Infof("Waiting for maintenance job %s to complete", job.Name)
 
-			if err := WaitForJobComplete(ctx, cli, &job); err != nil {
+			updated, err := WaitForJobComplete(ctx, cli, job.Namespace, job.Name)
+			if err != nil {
 				return nil, errors.Wrapf(err, "error waiting maintenance job[%s] complete", job.Name)
 			}
-		}
 
-		result := velerov1api.BackupRepositoryMaintenanceSucceeded
-		if job.Status.Failed > 0 {
-			result = velerov1api.BackupRepositoryMaintenanceFailed
+			job = updated
 		}
 
-		message, err := GetMaintenanceResultFromJob(cli, &job)
+		message, err := GetMaintenanceResultFromJob(cli, job)
 		if err != nil {
 			return nil, errors.Wrapf(err, "error getting maintenance job[%s] result", job.Name)
 		}
 
-		history = append(history, velerov1api.BackupRepositoryMaintenanceStatus{
-			Result:            result,
-			StartTimestamp:    &metav1.Time{Time: job.Status.StartTime.Time},
-			CompleteTimestamp: 
&metav1.Time{Time: job.Status.CompletionTime.Time}, - Message: message, - }) + history = append(history, ComposeMaintenanceStatusFromJob(job, message)) } - sort.Slice(history, func(i, j int) bool { - return history[i].CompleteTimestamp.Time.After(history[j].CompleteTimestamp.Time) - }) + return history, nil +} - startPos := len(history) - limit - if startPos < 0 { - startPos = 0 +func ComposeMaintenanceStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRepositoryMaintenanceStatus { + result := velerov1api.BackupRepositoryMaintenanceSucceeded + if job.Status.Failed > 0 { + result = velerov1api.BackupRepositoryMaintenanceFailed } - return history[startPos:], nil + return velerov1api.BackupRepositoryMaintenanceStatus{ + Result: result, + StartTimestamp: &metav1.Time{Time: job.CreationTimestamp.Time}, + CompleteTimestamp: &metav1.Time{Time: job.Status.CompletionTime.Time}, + Message: message, + } } diff --git a/pkg/repository/maintenance_test.go b/pkg/repository/maintenance_test.go index 7917ac65d8..6cde95c956 100644 --- a/pkg/repository/maintenance_test.go +++ b/pkg/repository/maintenance_test.go @@ -34,6 +34,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + "github.com/vmware-tanzu/velero/pkg/builder" velerotest "github.com/vmware-tanzu/velero/pkg/test" "github.com/vmware-tanzu/velero/pkg/util/kube" ) @@ -167,7 +168,7 @@ func TestWaitForJobComplete(t *testing.T) { // Create a fake Kubernetes client cli := fake.NewClientBuilder().WithObjects(job).Build() // Call the function - err := WaitForJobComplete(context.Background(), cli, job) + _, err := WaitForJobComplete(context.Background(), cli, job.Namespace, job.Name) // Check if the error matches the expectation if tc.expectError { @@ -459,143 +460,291 @@ func TestGetMaintenanceJobConfig(t *testing.T) { } } -// func TestWaitIncompleteMaintenance(t *testing.T) { -// ctx, cancel := context.WithCancel(context.Background()) - -// veleroNamespace := "velero" -// repo := &velerov1api.BackupRepository{ -// ObjectMeta: metav1.ObjectMeta{ -// Namespace: veleroNamespace, -// Name: "fake-repo", -// }, -// Spec: velerov1api.BackupRepositorySpec{ -// BackupStorageLocation: "default", -// RepositoryType: "kopia", -// VolumeNamespace: "test", -// }, -// } - -// scheme := runtime.NewScheme() -// batchv1.AddToScheme(scheme) - -// testCases := []struct { -// name string -// ctx context.Context -// kubeClientObj []runtime.Object -// runtimeScheme *runtime.Scheme -// expectedStatus []velerov1api.BackupRepositoryMaintenanceStatus -// expectedError string -// }{ -// { -// name: "list job error", -// expectedError: "error listing maintenance job for repo fake-repo", -// }, -// { -// name: "job not exist", -// runtimeScheme: scheme, -// }, -// { -// name: "no matching job", -// runtimeScheme: scheme, -// kubeClientObj: []runtime.Object{ -// jobOtherLabel, -// }, -// }, -// { -// name: "wait complete error", -// ctx: context.WithTimeout(context.TODO(), time.Second), -// runtimeScheme: scheme, -// kubeClientObj: []runtime.Object{ -// jobIncomplete, -// }, -// expectedError: nil, -// }, -// { -// name: "Find config specific for global", -// repoJobConfig: &v1.ConfigMap{ -// ObjectMeta: metav1.ObjectMeta{ -// Namespace: veleroNamespace, -// Name: repoMaintenanceJobConfig, -// }, -// Data: map[string]string{ -// GlobalKeyForRepoMaintenanceJobCM: 
"{\"podResources\":{\"cpuRequest\":\"50m\",\"cpuLimit\":\"100m\",\"memoryRequest\":\"50Mi\",\"memoryLimit\":\"100Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"n2\"]}]}}]}", -// }, -// }, -// expectedConfig: &JobConfigs{ -// PodResources: &kube.PodResources{ -// CPURequest: "50m", -// CPULimit: "100m", -// MemoryRequest: "50Mi", -// MemoryLimit: "100Mi", -// }, -// LoadAffinities: []*kube.LoadAffinity{ -// { -// NodeSelector: metav1.LabelSelector{ -// MatchExpressions: []metav1.LabelSelectorRequirement{ -// { -// Key: "cloud.google.com/machine-family", -// Operator: metav1.LabelSelectorOpIn, -// Values: []string{"n2"}, -// }, -// }, -// }, -// }, -// }, -// }, -// expectedError: nil, -// }, -// { -// name: "Specific config supersede global config", -// repoJobConfig: &v1.ConfigMap{ -// ObjectMeta: metav1.ObjectMeta{ -// Namespace: veleroNamespace, -// Name: repoMaintenanceJobConfig, -// }, -// Data: map[string]string{ -// GlobalKeyForRepoMaintenanceJobCM: "{\"podResources\":{\"cpuRequest\":\"50m\",\"cpuLimit\":\"100m\",\"memoryRequest\":\"50Mi\",\"memoryLimit\":\"100Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"n2\"]}]}}]}", -// "test-default-kopia": "{\"podResources\":{\"cpuRequest\":\"100m\",\"cpuLimit\":\"200m\",\"memoryRequest\":\"100Mi\",\"memoryLimit\":\"200Mi\"},\"loadAffinity\":[{\"nodeSelector\":{\"matchExpressions\":[{\"key\":\"cloud.google.com/machine-family\",\"operator\":\"In\",\"values\":[\"e2\"]}]}}]}", -// }, -// }, -// expectedConfig: &JobConfigs{ -// PodResources: &kube.PodResources{ -// CPURequest: "100m", -// CPULimit: "200m", -// MemoryRequest: "100Mi", -// MemoryLimit: "200Mi", -// }, -// LoadAffinities: []*kube.LoadAffinity{ -// { -// NodeSelector: metav1.LabelSelector{ -// MatchExpressions: []metav1.LabelSelectorRequirement{ -// { -// Key: "cloud.google.com/machine-family", -// Operator: metav1.LabelSelectorOpIn, -// Values: []string{"e2"}, -// }, -// }, -// }, -// }, -// }, -// }, -// expectedError: nil, -// }, -// } - -// for _, test := range testCases { -// t.Run(test.name, func(t *testing.T) { -// fakeClientBuilder := fake.NewClientBuilder() -// if test.runtimeScheme != nil { -// fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme) -// } - -// fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() - -// if tc.expectedError != nil { -// require.ErrorContains(t, err, tc.expectedError.Error()) -// } else { -// require.NoError(t, err) -// } -// require.Equal(t, tc.expectedConfig, jobConfig) -// }) -// } -// } +func TestWaitIncompleteMaintenance(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) + + veleroNamespace := "velero" + repo := &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: veleroNamespace, + Name: "fake-repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + BackupStorageLocation: "default", + RepositoryType: "kopia", + VolumeNamespace: "test", + }, + } + + now := time.Now().Round(time.Second) + + jobOtherLabel := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1", + Namespace: veleroNamespace, + Labels: map[string]string{RepositoryNameLabel: "other-repo"}, + CreationTimestamp: metav1.Time{Time: now}, + }, + } + + jobIncomplete := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1", + Namespace: veleroNamespace, + Labels: 
map[string]string{RepositoryNameLabel: "fake-repo"}, + CreationTimestamp: metav1.Time{Time: now}, + }, + } + + jobSucceeded1 := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1", + Namespace: veleroNamespace, + Labels: map[string]string{RepositoryNameLabel: "fake-repo"}, + CreationTimestamp: metav1.Time{Time: now}, + }, + Status: batchv1.JobStatus{ + StartTime: &metav1.Time{Time: now}, + CompletionTime: &metav1.Time{Time: now.Add(time.Hour)}, + Succeeded: 1, + }, + } + + jobPodSucceeded1 := builder.ForPod(veleroNamespace, "job1").Labels(map[string]string{"job-name": "job1"}).ContainerStatuses(&v1.ContainerStatus{ + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{}, + }, + }).Result() + + jobFailed1 := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job2", + Namespace: veleroNamespace, + Labels: map[string]string{RepositoryNameLabel: "fake-repo"}, + CreationTimestamp: metav1.Time{Time: now.Add(time.Hour)}, + }, + Status: batchv1.JobStatus{ + StartTime: &metav1.Time{Time: now.Add(time.Hour)}, + CompletionTime: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Failed: 1, + }, + } + + jobPodFailed1 := builder.ForPod(veleroNamespace, "job2").Labels(map[string]string{"job-name": "job2"}).ContainerStatuses(&v1.ContainerStatus{ + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + Message: "fake-message-2", + }, + }, + }).Result() + + jobSucceeded2 := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job3", + Namespace: veleroNamespace, + Labels: map[string]string{RepositoryNameLabel: "fake-repo"}, + CreationTimestamp: metav1.Time{Time: now.Add(time.Hour * 2)}, + }, + Status: batchv1.JobStatus{ + StartTime: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompletionTime: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Succeeded: 1, + }, + } + + jobPodSucceeded2 := builder.ForPod(veleroNamespace, "job3").Labels(map[string]string{"job-name": "job3"}).ContainerStatuses(&v1.ContainerStatus{ + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{}, + }, + }).Result() + + jobSucceeded3 := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job4", + Namespace: veleroNamespace, + Labels: map[string]string{RepositoryNameLabel: "fake-repo"}, + CreationTimestamp: metav1.Time{Time: now.Add(time.Hour * 3)}, + }, + Status: batchv1.JobStatus{ + StartTime: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompletionTime: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Succeeded: 1, + }, + } + + jobPodSucceeded3 := builder.ForPod(veleroNamespace, "job4").Labels(map[string]string{"job-name": "job4"}).ContainerStatuses(&v1.ContainerStatus{ + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{}, + }, + }).Result() + + schemeFail := runtime.NewScheme() + + scheme := runtime.NewScheme() + batchv1.AddToScheme(scheme) + v1.AddToScheme(scheme) + + testCases := []struct { + name string + ctx context.Context + kubeClientObj []runtime.Object + runtimeScheme *runtime.Scheme + expectedStatus []velerov1api.BackupRepositoryMaintenanceStatus + expectedError string + }{ + { + name: "list job error", + runtimeScheme: schemeFail, + expectedError: "error listing maintenance job for repo fake-repo: no kind is registered for the type v1.JobList in scheme \"pkg/runtime/scheme.go:100\"", + }, + { + name: "job not exist", + runtimeScheme: scheme, + }, + { + name: "no matching job", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + jobOtherLabel, + }, + }, + { + name: "wait complete error", + ctx: ctx, + runtimeScheme: scheme, + 
kubeClientObj: []runtime.Object{
+				jobIncomplete,
+			},
+			expectedError: "error waiting maintenance job[job1] complete: context deadline exceeded",
+		},
+		{
+			name:          "get result error",
+			ctx:           context.TODO(),
+			runtimeScheme: scheme,
+			kubeClientObj: []runtime.Object{
+				jobSucceeded1,
+			},
+			expectedError: "error getting maintenance job[job1] result: no pod found for job job1",
+		},
+		{
+			name:          "less than limit",
+			ctx:           context.TODO(),
+			runtimeScheme: scheme,
+			kubeClientObj: []runtime.Object{
+				jobFailed1,
+				jobSucceeded1,
+				jobPodSucceeded1,
+				jobPodFailed1,
+			},
+			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
+				},
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
+					Message:           "fake-message-2",
+				},
+			},
+		},
+		{
+			name:          "equal to limit",
+			ctx:           context.TODO(),
+			runtimeScheme: scheme,
+			kubeClientObj: []runtime.Object{
+				jobSucceeded2,
+				jobFailed1,
+				jobSucceeded1,
+				jobPodSucceeded1,
+				jobPodFailed1,
+				jobPodSucceeded2,
+			},
+			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
+				},
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
+					Message:           "fake-message-2",
+				},
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour * 2)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)},
+				},
+			},
+		},
+		{
+			name:          "more than limit",
+			ctx:           context.TODO(),
+			runtimeScheme: scheme,
+			kubeClientObj: []runtime.Object{
+				jobSucceeded3,
+				jobSucceeded2,
+				jobFailed1,
+				jobSucceeded1,
+				jobPodSucceeded1,
+				jobPodFailed1,
+				jobPodSucceeded2,
+				jobPodSucceeded3,
+			},
+			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
+					Message:           "fake-message-2",
+				},
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour * 2)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)},
+				},
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour * 3)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)},
+				},
+			},
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			fakeClientBuilder := fake.NewClientBuilder()
+			fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme)
+
+			fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
+
+			history, err := WaitIncompleteMaintenance(test.ctx, fakeClient, repo, 3, velerotest.NewLogger())
+
+			if test.expectedError != "" {
+				assert.EqualError(t, err, test.expectedError)
+			} else {
+				require.NoError(t, err)
+			
} + + assert.Len(t, history, len(test.expectedStatus)) + for i := 0; i < len(test.expectedStatus); i++ { + assert.Equal(t, test.expectedStatus[i].Result, history[i].Result) + assert.Equal(t, test.expectedStatus[i].Message, history[i].Message) + assert.Equal(t, test.expectedStatus[i].StartTimestamp.Time, history[i].StartTimestamp.Time) + assert.Equal(t, test.expectedStatus[i].CompleteTimestamp.Time, history[i].CompleteTimestamp.Time) + } + }) + } + + cancel() +} diff --git a/pkg/repository/manager/manager.go b/pkg/repository/manager/manager.go index f590f2b145..3853f2d3c3 100644 --- a/pkg/repository/manager/manager.go +++ b/pkg/repository/manager/manager.go @@ -54,7 +54,7 @@ type Manager interface { PrepareRepo(repo *velerov1api.BackupRepository) error // PruneRepo deletes unused data from a repo. - PruneRepo(repo *velerov1api.BackupRepository) error + PruneRepo(repo *velerov1api.BackupRepository) (velerov1api.BackupRepositoryMaintenanceStatus, error) // UnlockRepo removes stale locks from a repo. UnlockRepo(repo *velerov1api.BackupRepository) error @@ -172,13 +172,13 @@ func (m *manager) PrepareRepo(repo *velerov1api.BackupRepository) error { return prd.PrepareRepo(context.Background(), param) } -func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) error { +func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) (velerov1api.BackupRepositoryMaintenanceStatus, error) { m.repoLocker.LockExclusive(repo.Name) defer m.repoLocker.UnlockExclusive(repo.Name) param, err := m.assembleRepoParam(repo) if err != nil { - return errors.WithStack(err) + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.WithStack(err) } log := m.log.WithFields(logrus.Fields{ @@ -190,12 +190,12 @@ func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) error { job, err := repository.GetLatestMaintenanceJob(m.client, m.namespace) if err != nil { - return errors.WithStack(err) + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.WithStack(err) } if job != nil && job.Status.Succeeded == 0 && job.Status.Failed == 0 { log.Debugf("There already has a unfinished maintenance job %s/%s for repository %s, please wait for it to complete", job.Namespace, job.Name, param.BackupRepo.Name) - return nil + return velerov1api.BackupRepositoryMaintenanceStatus{}, nil } jobConfig, err := repository.GetMaintenanceJobConfig( @@ -220,13 +220,13 @@ func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) error { param, ) if err != nil { - return errors.Wrap(err, "error to build maintenance job") + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to build maintenance job") } log = log.WithField("job", fmt.Sprintf("%s/%s", maintenanceJob.Namespace, maintenanceJob.Name)) if err := m.client.Create(context.TODO(), maintenanceJob); err != nil { - return errors.Wrap(err, "error to create maintenance job") + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to create maintenance job") } log.Debug("Creating maintenance job") @@ -240,27 +240,18 @@ func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) error { } }() - var jobErr error - if err := repository.WaitForJobComplete(context.TODO(), m.client, maintenanceJob); err != nil { - log.WithError(err).Error("Error to wait for maintenance job complete") - jobErr = err // we won't return here for job may failed by maintenance failure, we want return the actual error + maintenanceJob, err = repository.WaitForJobComplete(context.TODO(), m.client, maintenanceJob.Namespace, 
maintenanceJob.Name) + if err != nil { + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to wait for maintenance job complete") } result, err := repository.GetMaintenanceResultFromJob(m.client, maintenanceJob) if err != nil { - return errors.Wrap(err, "error to get maintenance job result") - } - - if result != "" { - return errors.New(fmt.Sprintf("Maintenance job %s failed: %s", maintenanceJob.Name, result)) - } - - if jobErr != nil { - return errors.Wrap(jobErr, "error to wait for maintenance job complete") + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to get maintenance job result") } log.Info("Maintenance repo complete") - return nil + return repository.ComposeMaintenanceStatusFromJob(maintenanceJob, result), nil } func (m *manager) UnlockRepo(repo *velerov1api.BackupRepository) error { diff --git a/pkg/repository/mocks/Manager.go b/pkg/repository/mocks/Manager.go index 2264117752..c987693b4b 100644 --- a/pkg/repository/mocks/Manager.go +++ b/pkg/repository/mocks/Manager.go @@ -138,21 +138,31 @@ func (_m *Manager) PrepareRepo(repo *v1.BackupRepository) error { } // PruneRepo provides a mock function with given fields: repo -func (_m *Manager) PruneRepo(repo *v1.BackupRepository) error { +func (_m *Manager) PruneRepo(repo *v1.BackupRepository) (v1.BackupRepositoryMaintenanceStatus, error) { ret := _m.Called(repo) if len(ret) == 0 { panic("no return value specified for PruneRepo") } - var r0 error - if rf, ok := ret.Get(0).(func(*v1.BackupRepository) error); ok { + var r0 v1.BackupRepositoryMaintenanceStatus + var r1 error + if rf, ok := ret.Get(0).(func(*v1.BackupRepository) (v1.BackupRepositoryMaintenanceStatus, error)); ok { + return rf(repo) + } + if rf, ok := ret.Get(0).(func(*v1.BackupRepository) v1.BackupRepositoryMaintenanceStatus); ok { r0 = rf(repo) } else { - r0 = ret.Error(0) + r0 = ret.Get(0).(v1.BackupRepositoryMaintenanceStatus) } - return r0 + if rf, ok := ret.Get(1).(func(*v1.BackupRepository) error); ok { + r1 = rf(repo) + } else { + r1 = ret.Error(1) + } + + return r0, r1 } // UnlockRepo provides a mock function with given fields: repo From db69829fd7ba5629b321c8b6ad7a667651c8d67e Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Mon, 6 Jan 2025 16:25:33 +0800 Subject: [PATCH 3/9] repo maintenance job out of repo manager Signed-off-by: Lyndon-Li --- pkg/cmd/cli/repomantenance/maintenance.go | 87 +++--- pkg/cmd/server/server.go | 13 +- .../backup_repository_controller.go | 75 +++-- .../backup_repository_controller_test.go | 196 ++++++++++++- pkg/repository/maintenance.go | 223 ++++++++++++++- pkg/repository/maintenance_test.go | 260 +++++++++++++++++- pkg/repository/manager/manager.go | 245 ++--------------- pkg/repository/manager/manager_test.go | 232 +--------------- pkg/repository/mocks/Manager.go | 20 +- 9 files changed, 773 insertions(+), 578 deletions(-) diff --git a/pkg/cmd/cli/repomantenance/maintenance.go b/pkg/cmd/cli/repomantenance/maintenance.go index e467c054a9..80721a4369 100644 --- a/pkg/cmd/cli/repomantenance/maintenance.go +++ b/pkg/cmd/cli/repomantenance/maintenance.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "strings" + "time" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -12,21 +13,25 @@ import ( "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/vmware-tanzu/velero/internal/credentials" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" velerocli 
"github.com/vmware-tanzu/velero/pkg/client" "github.com/vmware-tanzu/velero/pkg/repository" - "github.com/vmware-tanzu/velero/pkg/repository/provider" "github.com/vmware-tanzu/velero/pkg/util/filesystem" "github.com/vmware-tanzu/velero/pkg/util/logging" + + repokey "github.com/vmware-tanzu/velero/pkg/repository/keys" + repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager" ) type Options struct { RepoName string BackupStorageLocation string RepoType string + ResourceTimeout time.Duration LogLevelFlag *logging.LevelFlag FormatFlag *logging.FormatFlag } @@ -83,39 +88,46 @@ func (o *Options) Run(f velerocli.Factory) { } } -func (o *Options) initClient(f velerocli.Factory) (client.Client, error) { +func (o *Options) initClient(f velerocli.Factory) (client.Client, kubernetes.Interface, error) { scheme := runtime.NewScheme() err := velerov1api.AddToScheme(scheme) if err != nil { - return nil, errors.Wrap(err, "failed to add velero scheme") + return nil, nil, errors.Wrap(err, "failed to add velero scheme") } err = v1.AddToScheme(scheme) if err != nil { - return nil, errors.Wrap(err, "failed to add api core scheme") + return nil, nil, errors.Wrap(err, "failed to add api core scheme") } config, err := f.ClientConfig() if err != nil { - return nil, errors.Wrap(err, "failed to get client config") + return nil, nil, errors.Wrap(err, "failed to get client config") } cli, err := client.New(config, client.Options{ Scheme: scheme, }) if err != nil { - return nil, errors.Wrap(err, "failed to create client") + return nil, nil, errors.Wrap(err, "failed to create client") + } + + kubeClient, err := f.KubeClient() + if err != nil { + return nil, nil, errors.Wrap(err, "failed to create kube client") } - return cli, nil + return cli, kubeClient, nil } -func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger logrus.FieldLogger) error { - cli, err := o.initClient(f) - if err != nil { - return err +func initRepoManager(namespace string, kubeClient kubernetes.Interface, cli client.Client, logger logrus.FieldLogger) (repomanager.Manager, error) { + // ensure the repo key secret is set up + if err := repokey.EnsureCommonRepositoryKey(kubeClient.CoreV1(), namespace); err != nil { + return nil, errors.Wrap(err, "failed to ensure repository key") } + repoLocker := repository.NewRepoLocker() + credentialFileStore, err := credentials.NewNamespacedFileStore( cli, namespace, @@ -123,23 +135,33 @@ func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger log filesystem.NewFileSystem(), ) if err != nil { - return errors.Wrap(err, "failed to create namespaced file store") + return nil, errors.Wrap(err, "failed to create namespaced file store") } credentialSecretStore, err := credentials.NewNamespacedSecretStore(cli, namespace) if err != nil { - return errors.Wrap(err, "failed to create namespaced secret store") + return nil, errors.Wrap(err, "failed to create namespaced secret store") } - var repoProvider provider.Provider - if o.RepoType == velerov1api.BackupRepositoryTypeRestic { - repoProvider = provider.NewResticRepositoryProvider(credentialFileStore, filesystem.NewFileSystem(), logger) - } else { - repoProvider = provider.NewUnifiedRepoProvider( - credentials.CredentialGetter{ - FromFile: credentialFileStore, - FromSecret: credentialSecretStore, - }, o.RepoType, logger) + return repomanager.NewManager( + namespace, + cli, + repoLocker, + credentialFileStore, + credentialSecretStore, + logger, + ), nil +} + +func (o *Options) runRepoPrune(f velerocli.Factory, namespace 
string, logger logrus.FieldLogger) error { + cli, kubeClient, err := o.initClient(f) + if err != nil { + return err + } + + manager, err := initRepoManager(namespace, kubeClient, cli, logger) + if err != nil { + return err } // backupRepository @@ -149,31 +171,14 @@ func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger log BackupLocation: o.BackupStorageLocation, RepositoryType: o.RepoType, }, true) - if err != nil { return errors.Wrap(err, "failed to get backup repository") } - // bsl - bsl := &velerov1api.BackupStorageLocation{} - err = cli.Get(context.Background(), client.ObjectKey{Namespace: namespace, Name: repo.Spec.BackupStorageLocation}, bsl) - if err != nil { - return errors.Wrap(err, "failed to get backup storage location") - } - - para := provider.RepoParam{ - BackupRepo: repo, - BackupLocation: bsl, - } - - err = repoProvider.BoostRepoConnect(context.Background(), para) - if err != nil { - return errors.Wrap(err, "failed to boost repo connect") - } - - err = repoProvider.PruneRepo(context.Background(), para) + err = manager.PruneRepo(repo) if err != nil { return errors.Wrap(err, "failed to prune repo") } + return nil } diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index a0828e69ef..dd616ad65a 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -491,15 +491,9 @@ func (s *server) initRepoManager() error { s.namespace, s.mgr.GetClient(), s.repoLocker, - s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, - s.config.RepoMaintenanceJobConfig, - s.config.PodResources, - s.config.KeepLatestMaintenanceJobs, s.logger, - s.logLevel, - s.config.LogFormat, ) return nil @@ -720,9 +714,14 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string s.namespace, s.logger, s.mgr.GetClient(), + s.repoManager, s.config.RepoMaintenanceFrequency, s.config.BackupRepoConfig, - s.repoManager, + s.config.KeepLatestMaintenanceJobs, + s.config.RepoMaintenanceJobConfig, + s.config.PodResources, + s.logLevel, + s.config.LogFormat, ).SetupWithManager(s.mgr); err != nil { s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupRepo) } diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index 535fc41b6b..52d3cf04c6 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -45,6 +45,7 @@ import ( repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config" repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager" "github.com/vmware-tanzu/velero/pkg/util/kube" + "github.com/vmware-tanzu/velero/pkg/util/logging" ) const ( @@ -55,16 +56,22 @@ const ( type BackupRepoReconciler struct { client.Client - namespace string - logger logrus.FieldLogger - clock clocks.WithTickerAndDelayedExecution - maintenanceFrequency time.Duration - backupRepoConfig string - repositoryManager repomanager.Manager + namespace string + logger logrus.FieldLogger + clock clocks.WithTickerAndDelayedExecution + maintenanceFrequency time.Duration + backupRepoConfig string + repositoryManager repomanager.Manager + keepLatestMaintenanceJobs int + repoMaintenanceConfig string + podResources kube.PodResources + logLevel logrus.Level + logFormat *logging.FormatFlag } -func NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client client.Client, - maintenanceFrequency time.Duration, backupRepoConfig string, repositoryManager repomanager.Manager) *BackupRepoReconciler { +func 
NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client client.Client, repositoryManager repomanager.Manager, + maintenanceFrequency time.Duration, backupRepoConfig string, keepLatestMaintenanceJobs int, repoMaintenanceConfig string, podResources kube.PodResources, + logLevel logrus.Level, logFormat *logging.FormatFlag) *BackupRepoReconciler { c := &BackupRepoReconciler{ client, namespace, @@ -73,6 +80,11 @@ func NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client maintenanceFrequency, backupRepoConfig, repositoryManager, + keepLatestMaintenanceJobs, + repoMaintenanceConfig, + podResources, + logLevel, + logFormat, } return c @@ -212,7 +224,13 @@ func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, errors.Wrap(err, "error handling incomplete repo maintenance jobs") } - return ctrl.Result{}, r.runMaintenanceIfDue(ctx, backupRepo, log) + if err := r.runMaintenanceIfDue(ctx, backupRepo, log); err != nil { + return ctrl.Result{}, errors.Wrap(err, "error check and run repo maintenance jobs") + } + + if err := repository.DeleteOldMaintenanceJobs(r.Client, req.Name, r.keepLatestMaintenanceJobs); err != nil { + log.WithError(err).Warn("Failed to delete old maintenance jobs") + } } return ctrl.Result{}, nil @@ -306,7 +324,7 @@ func ensureRepo(repo *velerov1api.BackupRepository, repoManager repomanager.Mana } func (r *BackupRepoReconciler) recallMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { - history, err := repository.WaitIncompleteMaintenance(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) + history, err := repository.WaitAllMaintenanceJobComplete(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) if err != nil { return errors.Wrapf(err, "error waiting incomplete repo maintenance job for repo %s", req.Name) } @@ -380,7 +398,11 @@ func getLastMaintenanceTimeFromHistory(history []velerov1api.BackupRepositoryMai time := history[0].CompleteTimestamp for i := range history { - if time.Before(history[i].CompleteTimestamp) { + if history[i].CompleteTimestamp == nil { + continue + } + + if time == nil || time.Before(history[i].CompleteTimestamp) { time = history[i].CompleteTimestamp } } @@ -406,8 +428,13 @@ func isEarlierMaintenanceStatus(a, b velerov1api.BackupRepositoryMaintenanceStat return a.StartTimestamp.Before(b.StartTimestamp) } +var funcStartMaintenanceJob = repository.StartMaintenanceJob +var funcWaitMaintenanceJobComplete = repository.WaitMaintenanceJobComplete + func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { - if !dueForMaintenance(req, r.clock.Now()) { + startTime := r.clock.Now() + + if !dueForMaintenance(req, startTime) { log.Debug("not due for maintenance") return nil } @@ -418,31 +445,39 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel // should not cause the repo to move to `NotReady`. 
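
`funcStartMaintenanceJob` and `funcWaitMaintenanceJobComplete` above are package-level function variables rather than direct calls, which gives the controller tests a seam to swap in stubs (as `TestRunMaintenanceIfDue` does later in this patch). A generic sketch of that pattern follows, under hypothetical names that are not Velero symbols.

```go
package seam

import "fmt"

// runJob is the production implementation; tests reassign the variable.
var runJob = func(name string) (string, error) {
	return "result of " + name, nil
}

// Process is the code under test; it always calls through the seam.
func Process(name string) (string, error) {
	out, err := runJob(name)
	if err != nil {
		return "", fmt.Errorf("job %s failed: %w", name, err)
	}
	return out, nil
}

// In a test file one would typically write:
//
//	old := runJob
//	defer func() { runJob = old }() // restore, so the stub doesn't leak
//	runJob = func(string) (string, error) { return "", errors.New("boom") }
//
// after which Process can be driven down its failure path without a real job.
```

Restoring the original with `defer` is the usual guard against one test's stub leaking into the next; the tests in this series instead overwrite the variables at the start of each scenario.
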
log.Debug("Pruning repo") - // when PruneRepo fails, the maintenance result will be left temporarily + job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.podResources, r.logLevel, r.logFormat, log) + if err != nil { + log.WithError(err).Warn("Starting repo maintenance failed") + return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { + updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, &metav1.Time{Time: startTime}, nil, fmt.Sprintf("Failed to start maintenance job, err: %v", err)) + }) + } + + // when WaitMaintenanceJobComplete fails, the maintenance result will be left temporarily // If the maintenenance still completes later, recallMaintenance recalls the left onces and update LastMaintenanceTime and history - status, err := r.repositoryManager.PruneRepo(req) + status, err := funcWaitMaintenanceJobComplete(r.Client, ctx, job, r.namespace, log) if err != nil { - return errors.Wrapf(err, "error pruning repository") + return errors.Wrapf(err, "error waiting repo maintenance completion status") } if status.Result == velerov1api.BackupRepositoryMaintenanceFailed { log.WithError(err).Warn("Pruning repository failed") return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { - updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp.Time, status.CompleteTimestamp.Time, status.Message) + updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceFailed, status.StartTimestamp, status.CompleteTimestamp, status.Message) }) } return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { rr.Status.LastMaintenanceTime = &metav1.Time{Time: status.CompleteTimestamp.Time} - updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp.Time, status.CompleteTimestamp.Time, status.Message) + updateRepoMaintenanceHistory(rr, velerov1api.BackupRepositoryMaintenanceSucceeded, status.StartTimestamp, status.CompleteTimestamp, status.Message) }) } -func updateRepoMaintenanceHistory(repo *velerov1api.BackupRepository, result velerov1api.BackupRepositoryMaintenanceResult, startTime time.Time, completionTime time.Time, message string) { +func updateRepoMaintenanceHistory(repo *velerov1api.BackupRepository, result velerov1api.BackupRepositoryMaintenanceResult, startTime, completionTime *metav1.Time, message string) { latest := velerov1api.BackupRepositoryMaintenanceStatus{ Result: result, - StartTimestamp: &metav1.Time{Time: startTime}, - CompleteTimestamp: &metav1.Time{Time: completionTime}, + StartTimestamp: startTime, + CompleteTimestamp: completionTime, Message: message, } diff --git a/pkg/controller/backup_repository_controller_test.go b/pkg/controller/backup_repository_controller_test.go index 9b5dd4c4ae..b3a358fabc 100644 --- a/pkg/controller/backup_repository_controller_test.go +++ b/pkg/controller/backup_repository_controller_test.go @@ -19,20 +19,30 @@ import ( "testing" "time" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" corev1 "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" + "github.com/vmware-tanzu/velero/pkg/builder" + "github.com/vmware-tanzu/velero/pkg/repository" repomokes 
"github.com/vmware-tanzu/velero/pkg/repository/mocks" repotypes "github.com/vmware-tanzu/velero/pkg/repository/types" velerotest "github.com/vmware-tanzu/velero/pkg/test" + "github.com/vmware-tanzu/velero/pkg/util/kube" + "github.com/vmware-tanzu/velero/pkg/util/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + batchv1 "k8s.io/api/batch/v1" ) const testMaintenanceFrequency = 10 * time.Minute @@ -47,9 +57,14 @@ func mockBackupRepoReconciler(t *testing.T, mockOn string, arg interface{}, ret velerov1api.DefaultNamespace, velerotest.NewLogger(), velerotest.NewFakeControllerRuntimeClient(t), + mgr, testMaintenanceFrequency, "fake-repo-config", - mgr, + 3, + "", + kube.PodResources{}, + logrus.InfoLevel, + nil, ) } @@ -104,24 +119,71 @@ func TestCheckNotReadyRepo(t *testing.T) { assert.Equal(t, "s3:test.amazonaws.com/bucket/restic/volume-ns-1", rr.Spec.ResticIdentifier) } +func startMaintenanceJobFail(client.Client, context.Context, *velerov1api.BackupRepository, string, kube.PodResources, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error) { + return "", errors.New("fake-start-error") +} + +func startMaintenanceJobSucceed(client.Client, context.Context, *velerov1api.BackupRepository, string, kube.PodResources, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error) { + return "fake-job-name", nil +} + +func waitMaintenanceJobCompleteFail(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.New("fake-wait-error") +} + +func waitMaintenanceJobCompleteSucceed(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { + return velerov1api.BackupRepositoryMaintenanceStatus{ + StartTimestamp: &metav1.Time{Time: time.Now()}, + CompleteTimestamp: &metav1.Time{Time: time.Now().Add(time.Hour)}, + }, nil +} + func TestRunMaintenanceIfDue(t *testing.T) { rr := mockBackupRepositoryCR() - reconciler := mockBackupRepoReconciler(t, "PruneRepo", rr, velerov1api.BackupRepositoryMaintenanceStatus{ - StartTimestamp: &metav1.Time{}, - CompleteTimestamp: &metav1.Time{}, - }, nil) + reconciler := mockBackupRepoReconciler(t, "", rr, nil) + funcStartMaintenanceJob = startMaintenanceJobFail err := reconciler.Client.Create(context.TODO(), rr) assert.NoError(t, err) lastTm := rr.Status.LastMaintenanceTime + history := rr.Status.RecentMaintenance + err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) + assert.NoError(t, err) + assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) + assert.NotEqual(t, rr.Status.RecentMaintenance, history) + + rr = mockBackupRepositoryCR() + reconciler = mockBackupRepoReconciler(t, "", rr, nil) + funcStartMaintenanceJob = startMaintenanceJobSucceed + funcWaitMaintenanceJobComplete = waitMaintenanceJobCompleteFail + err = reconciler.Client.Create(context.TODO(), rr) + assert.NoError(t, err) + lastTm = rr.Status.LastMaintenanceTime + history = rr.Status.RecentMaintenance + err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) + assert.EqualError(t, err, "error waiting repo maintenance completion status: fake-wait-error") + assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) + assert.Equal(t, rr.Status.RecentMaintenance, history) + + rr = mockBackupRepositoryCR() + reconciler = 
mockBackupRepoReconciler(t, "", rr, nil) + funcStartMaintenanceJob = startMaintenanceJobSucceed + funcWaitMaintenanceJobComplete = waitMaintenanceJobCompleteSucceed + err = reconciler.Client.Create(context.TODO(), rr) + assert.NoError(t, err) + lastTm = rr.Status.LastMaintenanceTime + history = rr.Status.RecentMaintenance err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) assert.NoError(t, err) assert.NotEqual(t, rr.Status.LastMaintenanceTime, lastTm) + assert.NotEqual(t, rr.Status.RecentMaintenance, history) rr.Status.LastMaintenanceTime = &metav1.Time{Time: time.Now()} lastTm = rr.Status.LastMaintenanceTime + history = rr.Status.RecentMaintenance err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) assert.NoError(t, err) assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) + assert.Equal(t, rr.Status.RecentMaintenance, history) } func TestInitializeRepo(t *testing.T) { @@ -251,9 +313,14 @@ func TestGetRepositoryMaintenanceFrequency(t *testing.T) { velerov1api.DefaultNamespace, velerotest.NewLogger(), velerotest.NewFakeControllerRuntimeClient(t), + &mgr, test.userDefinedFreq, "", - &mgr, + 3, + "", + kube.PodResources{}, + logrus.InfoLevel, + nil, ) freq := reconciler.getRepositoryMaintenanceFrequency(test.repo) @@ -380,7 +447,14 @@ func TestNeedInvalidBackupRepo(t *testing.T) { velerov1api.DefaultNamespace, velerotest.NewLogger(), velerotest.NewFakeControllerRuntimeClient(t), - time.Duration(0), "", nil) + nil, + time.Duration(0), + "", + 3, + "", + kube.PodResources{}, + logrus.InfoLevel, + nil) need := reconciler.needInvalidBackupRepo(test.oldBSL, test.newBSL) assert.Equal(t, test.expect, need) @@ -656,7 +730,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - updateRepoMaintenanceHistory(test.backupRepo, test.result, standardTime, standardTime.Add(time.Hour), "fake-message-0") + updateRepoMaintenanceHistory(test.backupRepo, test.result, &metav1.Time{Time: standardTime}, &metav1.Time{Time: standardTime.Add(time.Hour)}, "fake-message-0") for at := range test.backupRepo.Status.RecentMaintenance { assert.Equal(t, test.expectedHistory[at].StartTimestamp.Time, test.backupRepo.Status.RecentMaintenance[at].StartTimestamp.Time) @@ -666,3 +740,109 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) { }) } } + +func TestRecallMaintenance(t *testing.T) { + now := time.Now() + + schemeFail := runtime.NewScheme() + velerov1api.AddToScheme(schemeFail) + + scheme := runtime.NewScheme() + batchv1.AddToScheme(scheme) + corev1.AddToScheme(scheme) + velerov1api.AddToScheme(scheme) + + jobSucceeded := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1", + Namespace: velerov1api.DefaultNamespace, + Labels: map[string]string{repository.RepositoryNameLabel: "repo"}, + CreationTimestamp: metav1.Time{Time: now.Add(time.Hour)}, + }, + Status: batchv1.JobStatus{ + StartTime: &metav1.Time{Time: now.Add(time.Hour)}, + CompletionTime: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Succeeded: 1, + }, + } + + jobPodSucceeded := builder.ForPod(velerov1api.DefaultNamespace, "job1").Labels(map[string]string{"job-name": "job1"}).ContainerStatuses(&v1.ContainerStatus{ + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{}, + }, + }).Result() + + tests := []struct { + name string + kubeClientObj []runtime.Object + runtimeScheme *runtime.Scheme + repoLastMatainTime metav1.Time + expectNewHistory bool + expectTimeUpdate bool + expectedErr string + }{ + { + name: "wait 
completion error", + runtimeScheme: schemeFail, + expectedErr: "error waiting incomplete repo maintenance job for repo repo: error listing maintenance job for repo repo: no kind is registered for the type v1.JobList in scheme \"pkg/runtime/scheme.go:100\"", + }, + { + name: "no consolidate result", + runtimeScheme: scheme, + }, + { + name: "no update last time", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + jobSucceeded, + jobPodSucceeded, + }, + repoLastMatainTime: metav1.Time{Time: now.Add(time.Hour * 5)}, + expectNewHistory: true, + }, + { + name: "update last time", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + jobSucceeded, + jobPodSucceeded, + }, + expectNewHistory: true, + expectTimeUpdate: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + r := mockBackupRepoReconciler(t, "", nil, nil) + + backupRepo := mockBackupRepositoryCR() + backupRepo.Status.LastMaintenanceTime = &test.repoLastMatainTime + + test.kubeClientObj = append(test.kubeClientObj, backupRepo) + + fakeClientBuilder := fake.NewClientBuilder() + fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme) + fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() + r.Client = fakeClient + + lastTm := backupRepo.Status.LastMaintenanceTime + history := backupRepo.Status.RecentMaintenance + + err := r.recallMaintenance(context.TODO(), backupRepo, velerotest.NewLogger()) + if test.expectedErr != "" { + assert.EqualError(t, err, test.expectedErr) + } else { + assert.NoError(t, err) + + if test.expectNewHistory { + assert.NotEqual(t, history, backupRepo.Status.RecentMaintenance) + } + + if test.expectTimeUpdate { + assert.NotEqual(t, lastTm, backupRepo.Status.LastMaintenanceTime) + } + } + }) + } +} diff --git a/pkg/repository/maintenance.go b/pkg/repository/maintenance.go index 5a1ec1f614..0e048d0a76 100644 --- a/pkg/repository/maintenance.go +++ b/pkg/repository/maintenance.go @@ -35,6 +35,12 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/util/kube" + + appsv1 "k8s.io/api/apps/v1" + + veleroutil "github.com/vmware-tanzu/velero/pkg/util/velero" + + "github.com/vmware-tanzu/velero/pkg/util/logging" ) const ( @@ -86,29 +92,34 @@ func DeleteOldMaintenanceJobs(cli client.Client, repo string, keep int) error { return nil } -// WaitForJobComplete wait for completion of the specified job and update the latest job object -func WaitForJobComplete(ctx context.Context, client client.Client, ns string, job string) (*batchv1.Job, error) { - updated := &batchv1.Job{} +// waitForJobComplete wait for completion of the specified job and update the latest job object +func waitForJobComplete(ctx context.Context, client client.Client, ns string, job string) (*batchv1.Job, error) { + var ret *batchv1.Job + err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { + updated := &batchv1.Job{} err := client.Get(ctx, types.NamespacedName{Namespace: ns, Name: job}, updated) if err != nil && !apierrors.IsNotFound(err) { return false, err } + ret = updated + if updated.Status.Succeeded > 0 { return true, nil } if updated.Status.Failed > 0 { - return true, fmt.Errorf("maintenance job %s/%s failed", job, job) + return true, nil } + return false, nil }) - return updated, err + return ret, err } -func GetMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, error) { +func getMaintenanceResultFromJob(cli client.Client, job 
*batchv1.Job) (string, error) { // Get the maintenance job related pod by label selector podList := &v1.PodList{} err := cli.List(context.TODO(), podList, client.InNamespace(job.Namespace), client.MatchingLabels(map[string]string{"job-name": job.Name})) @@ -137,7 +148,7 @@ func GetMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, e return terminated.Message, nil } -func GetLatestMaintenanceJob(cli client.Client, ns string) (*batchv1.Job, error) { +func getLatestMaintenanceJob(cli client.Client, ns string) (*batchv1.Job, error) { // Get the maintenance job list by label jobList := &batchv1.JobList{} err := cli.List(context.TODO(), jobList, &client.ListOptions{ @@ -162,7 +173,7 @@ func GetLatestMaintenanceJob(cli client.Client, ns string) (*batchv1.Job, error) return &jobList.Items[0], nil } -// GetMaintenanceJobConfig is called to get the Maintenance Job Config for the +// getMaintenanceJobConfig is called to get the Maintenance Job Config for the // BackupRepository specified by the repo parameter. // // Params: @@ -173,7 +184,7 @@ func GetLatestMaintenanceJob(cli client.Client, ns string) (*batchv1.Job, error) // veleroNamespace: the Velero-installed namespace. It's used to retrieve the BackupRepository. // repoMaintenanceJobConfig: the repository maintenance job ConfigMap name. // repo: the BackupRepository needs to run the maintenance Job. -func GetMaintenanceJobConfig( +func getMaintenanceJobConfig( ctx context.Context, client client.Client, logger logrus.FieldLogger, @@ -255,9 +266,27 @@ func GetMaintenanceJobConfig( return result, nil } -// WaitIncompleteMaintenance checks all the incomplete maintenance jobs of the specified repo and wait for them to complete, +func WaitMaintenanceJobComplete(cli client.Client, ctx context.Context, jobName, ns string, logger logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { + log := logger.WithField("job name", jobName) + + maintenanceJob, err := waitForJobComplete(ctx, cli, ns, jobName) + if err != nil { + return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to wait for maintenance job complete") + } + + log.Info("Maintenance repo complete") + + result, err := getMaintenanceResultFromJob(cli, maintenanceJob) + if err != nil { + log.WithError(err).Warn("Failed to get maintenance job result") + } + + return composeMaintenanceStatusFromJob(maintenanceJob, result), nil +} + +// WaitAllMaintenanceJobComplete checks all the incomplete maintenance jobs of the specified repo and wait for them to complete, // and then return the maintenance jobs in the range of limit -func WaitIncompleteMaintenance(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) { +func WaitAllMaintenanceJobComplete(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) { jobList := &batchv1.JobList{} err := cli.List(context.TODO(), jobList, &client.ListOptions{ Namespace: repo.Namespace, @@ -290,7 +319,7 @@ func WaitIncompleteMaintenance(ctx context.Context, cli client.Client, repo *vel if job.Status.Succeeded == 0 && job.Status.Failed == 0 { log.Infof("Waiting for maintenance job %s to complete", job.Name) - updated, err := WaitForJobComplete(ctx, cli, job.Namespace, job.Name) + updated, err := waitForJobComplete(ctx, cli, job.Namespace, job.Name) if err != nil { return nil, 
errors.Wrapf(err, "error waiting maintenance job[%s] complete", job.Name) } @@ -298,18 +327,182 @@ func WaitIncompleteMaintenance(ctx context.Context, cli client.Client, repo *vel job = updated } - message, err := GetMaintenanceResultFromJob(cli, job) + message, err := getMaintenanceResultFromJob(cli, job) if err != nil { return nil, errors.Wrapf(err, "error getting maintenance job[%s] result", job.Name) } - history = append(history, ComposeMaintenanceStatusFromJob(job, message)) + history = append(history, composeMaintenanceStatusFromJob(job, message)) } return history, nil } -func ComposeMaintenanceStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRepositoryMaintenanceStatus { +func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, repoMaintenanceJobConfig string, + podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag, logger logrus.FieldLogger) (string, error) { + bsl := &velerov1api.BackupStorageLocation{} + if err := cli.Get(ctx, client.ObjectKey{Namespace: repo.Namespace, Name: repo.Spec.BackupStorageLocation}, bsl); err != nil { + return "", errors.WithStack(err) + } + + log := logger.WithFields(logrus.Fields{ + "BSL name": bsl.Name, + "repo type": repo.Spec.RepositoryType, + "repo name": repo.Name, + "repo UID": repo.UID, + }) + + jobConfig, err := getMaintenanceJobConfig( + ctx, + cli, + log, + repo.Namespace, + repoMaintenanceJobConfig, + repo, + ) + if err != nil { + log.Warnf("Fail to find the ConfigMap %s to build maintenance job with error: %s. Use default value.", + repo.Namespace+"/"+repoMaintenanceJobConfig, + err.Error(), + ) + } + + log.Info("Starting maintenance repo") + + maintenanceJob, err := buildMaintenanceJob(cli, ctx, repo, bsl.Name, jobConfig, podResources, logLevel, logFormat) + if err != nil { + return "", errors.Wrap(err, "error to build maintenance job") + } + + log = log.WithField("job", fmt.Sprintf("%s/%s", maintenanceJob.Namespace, maintenanceJob.Name)) + + if err := cli.Create(context.TODO(), maintenanceJob); err != nil { + return "", errors.Wrap(err, "error to create maintenance job") + } + + log.Info("Repo maintenance job started") + + return maintenanceJob.Name, nil +} + +func buildMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, bslName string, config *JobConfigs, + podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag) (*batchv1.Job, error) { + // Get the Velero server deployment + deployment := &appsv1.Deployment{} + err := cli.Get(ctx, types.NamespacedName{Name: "velero", Namespace: repo.Namespace}, deployment) + if err != nil { + return nil, err + } + + // Get the environment variables from the Velero server deployment + envVars := veleroutil.GetEnvVarsFromVeleroServer(deployment) + + // Get the referenced storage from the Velero server deployment + envFromSources := veleroutil.GetEnvFromSourcesFromVeleroServer(deployment) + + // Get the volume mounts from the Velero server deployment + volumeMounts := veleroutil.GetVolumeMountsFromVeleroServer(deployment) + + // Get the volumes from the Velero server deployment + volumes := veleroutil.GetVolumesFromVeleroServer(deployment) + + // Get the service account from the Velero server deployment + serviceAccount := veleroutil.GetServiceAccountFromVeleroServer(deployment) + + // Get image + image := veleroutil.GetVeleroServerImage(deployment) + + // Set resource limits and requests + cpuRequest := podResources.CPURequest + memRequest := 
podResources.MemoryRequest + cpuLimit := podResources.CPULimit + memLimit := podResources.MemoryLimit + if config != nil && config.PodResources != nil { + cpuRequest = config.PodResources.CPURequest + memRequest = config.PodResources.MemoryRequest + cpuLimit = config.PodResources.CPULimit + memLimit = config.PodResources.MemoryLimit + } + resources, err := kube.ParseResourceRequirements(cpuRequest, memRequest, cpuLimit, memLimit) + if err != nil { + return nil, errors.Wrap(err, "failed to parse resource requirements for maintenance job") + } + + // Set arguments + args := []string{"repo-maintenance"} + args = append(args, fmt.Sprintf("--repo-name=%s", repo.Spec.VolumeNamespace)) + args = append(args, fmt.Sprintf("--repo-type=%s", repo.Spec.RepositoryType)) + args = append(args, fmt.Sprintf("--backup-storage-location=%s", bslName)) + args = append(args, fmt.Sprintf("--log-level=%s", logLevel.String())) + args = append(args, fmt.Sprintf("--log-format=%s", logFormat.String())) + + // build the maintenance job + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: GenerateJobName(repo.Name), + Namespace: repo.Namespace, + Labels: map[string]string{ + RepositoryNameLabel: repo.Name, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: new(int32), // Never retry + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "velero-repo-maintenance-pod", + Labels: map[string]string{ + RepositoryNameLabel: repo.Name, + }, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "velero-repo-maintenance-container", + Image: image, + Command: []string{ + "/velero", + }, + Args: args, + ImagePullPolicy: v1.PullIfNotPresent, + Env: envVars, + EnvFrom: envFromSources, + VolumeMounts: volumeMounts, + Resources: resources, + }, + }, + RestartPolicy: v1.RestartPolicyNever, + Volumes: volumes, + ServiceAccountName: serviceAccount, + }, + }, + }, + } + + if config != nil && len(config.LoadAffinities) > 0 { + affinity := kube.ToSystemAffinity(config.LoadAffinities) + job.Spec.Template.Spec.Affinity = affinity + } + + if tolerations := veleroutil.GetTolerationsFromVeleroServer(deployment); tolerations != nil { + job.Spec.Template.Spec.Tolerations = tolerations + } + + if nodeSelector := veleroutil.GetNodeSelectorFromVeleroServer(deployment); nodeSelector != nil { + job.Spec.Template.Spec.NodeSelector = nodeSelector + } + + if labels := veleroutil.GetVeleroServerLables(deployment); len(labels) > 0 { + job.Spec.Template.Labels = labels + } + + if annotations := veleroutil.GetVeleroServerAnnotations(deployment); len(annotations) > 0 { + job.Spec.Template.Annotations = annotations + } + + return job, nil +} + +func composeMaintenanceStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRepositoryMaintenanceStatus { result := velerov1api.BackupRepositoryMaintenanceSucceeded if job.Status.Failed > 0 { result = velerov1api.BackupRepositoryMaintenanceFailed diff --git a/pkg/repository/maintenance_test.go b/pkg/repository/maintenance_test.go index 6cde95c956..42e2477d2c 100644 --- a/pkg/repository/maintenance_test.go +++ b/pkg/repository/maintenance_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/require" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -35,8 +36,12 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/builder" + 
"github.com/vmware-tanzu/velero/pkg/repository/provider" velerotest "github.com/vmware-tanzu/velero/pkg/test" "github.com/vmware-tanzu/velero/pkg/util/kube" + "github.com/vmware-tanzu/velero/pkg/util/logging" + + appsv1 "k8s.io/api/apps/v1" ) func TestGenerateJobName1(t *testing.T) { @@ -138,25 +143,45 @@ func TestWaitForJobComplete(t *testing.T) { Status: batchv1.JobStatus{}, } + schemeFail := runtime.NewScheme() + + scheme := runtime.NewScheme() + batchv1.AddToScheme(scheme) + // Define test cases tests := []struct { - description string // Test case description - jobStatus batchv1.JobStatus // Job status to set for the test - expectError bool // Whether an error is expected + description string // Test case description + kubeClientObj []runtime.Object + runtimeScheme *runtime.Scheme + jobStatus batchv1.JobStatus // Job status to set for the test + expectError bool // Whether an error is expected }{ { - description: "Job Succeeded", + description: "wait error", + runtimeScheme: schemeFail, + expectError: true, + }, + { + description: "Job Succeeded", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + job, + }, jobStatus: batchv1.JobStatus{ Succeeded: 1, }, expectError: false, }, { - description: "Job Failed", + description: "Job Failed", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + job, + }, jobStatus: batchv1.JobStatus{ Failed: 1, }, - expectError: true, + expectError: false, }, } @@ -166,9 +191,12 @@ func TestWaitForJobComplete(t *testing.T) { // Set the job status job.Status = tc.jobStatus // Create a fake Kubernetes client - cli := fake.NewClientBuilder().WithObjects(job).Build() + fakeClientBuilder := fake.NewClientBuilder() + fakeClientBuilder = fakeClientBuilder.WithScheme(tc.runtimeScheme) + fakeClient := fakeClientBuilder.WithRuntimeObjects(tc.kubeClientObj...).Build() + // Call the function - _, err := WaitForJobComplete(context.Background(), cli, job.Namespace, job.Name) + _, err := waitForJobComplete(context.Background(), fakeClient, job.Namespace, job.Name) // Check if the error matches the expectation if tc.expectError { @@ -202,7 +230,7 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { cli := fake.NewClientBuilder().WithObjects(job, pod).Build() // test an error should be returned - result, err := GetMaintenanceResultFromJob(cli, job) + result, err := getMaintenanceResultFromJob(cli, job) assert.Error(t, err) assert.Equal(t, "", result) @@ -217,7 +245,7 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { // Test an error should be returned cli = fake.NewClientBuilder().WithObjects(job, pod).Build() - result, err = GetMaintenanceResultFromJob(cli, job) + result, err = getMaintenanceResultFromJob(cli, job) assert.Error(t, err) assert.Equal(t, "", result) @@ -236,7 +264,7 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { // This call should return the termination message with no error cli = fake.NewClientBuilder().WithObjects(job, pod).Build() - result, err = GetMaintenanceResultFromJob(cli, job) + result, err = getMaintenanceResultFromJob(cli, job) assert.NoError(t, err) assert.Equal(t, "test message", result) } @@ -278,7 +306,7 @@ func TestGetLatestMaintenanceJob(t *testing.T) { cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() // Call the function - job, err := GetLatestMaintenanceJob(cli, "default") + job, err := getLatestMaintenanceJob(cli, "default") assert.NoError(t, err) // We expect the returned job to be the newer job @@ -441,7 +469,7 @@ func TestGetMaintenanceJobConfig(t *testing.T) { 
fakeClient = velerotest.NewFakeControllerRuntimeClient(t) } - jobConfig, err := GetMaintenanceJobConfig( + jobConfig, err := getMaintenanceJobConfig( ctx, fakeClient, logger, @@ -460,7 +488,7 @@ func TestGetMaintenanceJobConfig(t *testing.T) { } } -func TestWaitIncompleteMaintenance(t *testing.T) { +func TestWaitAllMaintenanceJobComplete(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) veleroNamespace := "velero" @@ -728,7 +756,7 @@ func TestWaitIncompleteMaintenance(t *testing.T) { fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() - history, err := WaitIncompleteMaintenance(test.ctx, fakeClient, repo, 3, velerotest.NewLogger()) + history, err := WaitAllMaintenanceJobComplete(test.ctx, fakeClient, repo, 3, velerotest.NewLogger()) if test.expectedError != "" { assert.EqualError(t, err, test.expectedError) @@ -748,3 +776,205 @@ func TestWaitIncompleteMaintenance(t *testing.T) { cancel() } + +func TestBuildMaintenanceJob(t *testing.T) { + testCases := []struct { + name string + m *JobConfigs + deploy *appsv1.Deployment + logLevel logrus.Level + logFormat *logging.FormatFlag + expectedJobName string + expectedError bool + expectedEnv []v1.EnvVar + expectedEnvFrom []v1.EnvFromSource + }{ + { + name: "Valid maintenance job", + m: &JobConfigs{ + PodResources: &kube.PodResources{ + CPURequest: "100m", + MemoryRequest: "128Mi", + CPULimit: "200m", + MemoryLimit: "256Mi", + }, + }, + deploy: &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "velero", + Namespace: "velero", + }, + Spec: appsv1.DeploymentSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "velero-repo-maintenance-container", + Image: "velero-image", + Env: []v1.EnvVar{ + { + Name: "test-name", + Value: "test-value", + }, + }, + EnvFrom: []v1.EnvFromSource{ + { + ConfigMapRef: &v1.ConfigMapEnvSource{ + LocalObjectReference: v1.LocalObjectReference{ + Name: "test-configmap", + }, + }, + }, + { + SecretRef: &v1.SecretEnvSource{ + LocalObjectReference: v1.LocalObjectReference{ + Name: "test-secret", + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + logLevel: logrus.InfoLevel, + logFormat: logging.NewFormatFlag(), + expectedJobName: "test-123-maintain-job", + expectedError: false, + expectedEnv: []v1.EnvVar{ + { + Name: "test-name", + Value: "test-value", + }, + }, + expectedEnvFrom: []v1.EnvFromSource{ + { + ConfigMapRef: &v1.ConfigMapEnvSource{ + LocalObjectReference: v1.LocalObjectReference{ + Name: "test-configmap", + }, + }, + }, + { + SecretRef: &v1.SecretEnvSource{ + LocalObjectReference: v1.LocalObjectReference{ + Name: "test-secret", + }, + }, + }, + }, + }, + { + name: "Error getting Velero server deployment", + m: &JobConfigs{ + PodResources: &kube.PodResources{ + CPURequest: "100m", + MemoryRequest: "128Mi", + CPULimit: "200m", + MemoryLimit: "256Mi", + }, + }, + logLevel: logrus.InfoLevel, + logFormat: logging.NewFormatFlag(), + expectedJobName: "", + expectedError: true, + }, + } + + param := provider.RepoParam{ + BackupRepo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "velero", + Name: "test-123", + }, + Spec: velerov1api.BackupRepositorySpec{ + VolumeNamespace: "test-123", + RepositoryType: "kopia", + }, + }, + BackupLocation: &velerov1api.BackupStorageLocation{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "velero", + Name: "test-location", + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create a fake clientset with 
resources + objs := []runtime.Object{param.BackupLocation, param.BackupRepo} + + if tc.deploy != nil { + objs = append(objs, tc.deploy) + } + scheme := runtime.NewScheme() + _ = appsv1.AddToScheme(scheme) + _ = velerov1api.AddToScheme(scheme) + cli := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(objs...).Build() + + // Call the function to test + job, err := buildMaintenanceJob(cli, context.TODO(), param.BackupRepo, param.BackupLocation.Name, tc.m, *tc.m.PodResources, tc.logLevel, tc.logFormat) + + // Check the error + if tc.expectedError { + assert.Error(t, err) + assert.Nil(t, job) + } else { + assert.NoError(t, err) + assert.NotNil(t, job) + assert.Contains(t, job.Name, tc.expectedJobName) + assert.Equal(t, param.BackupRepo.Namespace, job.Namespace) + assert.Equal(t, param.BackupRepo.Name, job.Labels[RepositoryNameLabel]) + + assert.Equal(t, param.BackupRepo.Name, job.Spec.Template.ObjectMeta.Labels[RepositoryNameLabel]) + + // Check container + assert.Len(t, job.Spec.Template.Spec.Containers, 1) + container := job.Spec.Template.Spec.Containers[0] + assert.Equal(t, "velero-repo-maintenance-container", container.Name) + assert.Equal(t, "velero-image", container.Image) + assert.Equal(t, v1.PullIfNotPresent, container.ImagePullPolicy) + + // Check container env + assert.Equal(t, tc.expectedEnv, container.Env) + assert.Equal(t, tc.expectedEnvFrom, container.EnvFrom) + + // Check resources + expectedResources := v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse(tc.m.PodResources.CPURequest), + v1.ResourceMemory: resource.MustParse(tc.m.PodResources.MemoryRequest), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse(tc.m.PodResources.CPULimit), + v1.ResourceMemory: resource.MustParse(tc.m.PodResources.MemoryLimit), + }, + } + assert.Equal(t, expectedResources, container.Resources) + + // Check args + expectedArgs := []string{ + "repo-maintenance", + fmt.Sprintf("--repo-name=%s", param.BackupRepo.Spec.VolumeNamespace), + fmt.Sprintf("--repo-type=%s", param.BackupRepo.Spec.RepositoryType), + fmt.Sprintf("--backup-storage-location=%s", param.BackupLocation.Name), + fmt.Sprintf("--log-level=%s", tc.logLevel.String()), + fmt.Sprintf("--log-format=%s", tc.logFormat.String()), + } + assert.Equal(t, expectedArgs, container.Args) + + // Check affinity + assert.Nil(t, job.Spec.Template.Spec.Affinity) + + // Check tolerations + assert.Nil(t, job.Spec.Template.Spec.Tolerations) + + // Check node selector + assert.Nil(t, job.Spec.Template.Spec.NodeSelector) + } + }) + } +} diff --git a/pkg/repository/manager/manager.go b/pkg/repository/manager/manager.go index 3853f2d3c3..7027c96500 100644 --- a/pkg/repository/manager/manager.go +++ b/pkg/repository/manager/manager.go @@ -23,11 +23,6 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" - appsv1 "k8s.io/api/apps/v1" - batchv1 "k8s.io/api/batch/v1" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/vmware-tanzu/velero/internal/credentials" @@ -35,9 +30,6 @@ import ( "github.com/vmware-tanzu/velero/pkg/repository" "github.com/vmware-tanzu/velero/pkg/repository/provider" "github.com/vmware-tanzu/velero/pkg/util/filesystem" - "github.com/vmware-tanzu/velero/pkg/util/kube" - "github.com/vmware-tanzu/velero/pkg/util/logging" - veleroutil "github.com/vmware-tanzu/velero/pkg/util/velero" ) // Manager manages backup repositories. 
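// In sketch form, the slimmed-down manager is now driven like this (wiring is
// illustrative only; PruneRepo runs the provider-level prune synchronously,
// while maintenance Job orchestration has moved into pkg/repository):
//
//	mgr := repomanager.NewManager(namespace, cli, repoLocker, credFileStore, credSecretStore, log)
//	if err := mgr.PruneRepo(repo); err != nil {
//		log.WithError(err).Error("repo prune failed") // no maintenance status is returned anymore
//	}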
@@ -54,7 +46,7 @@ type Manager interface { PrepareRepo(repo *velerov1api.BackupRepository) error // PruneRepo deletes unused data from a repo. - PruneRepo(repo *velerov1api.BackupRepository) (velerov1api.BackupRepositoryMaintenanceStatus, error) + PruneRepo(repo *velerov1api.BackupRepository) error // UnlockRepo removes stale locks from a repo. UnlockRepo(repo *velerov1api.BackupRepository) error @@ -76,16 +68,10 @@ type manager struct { providers map[string]provider.Provider // client is the Velero controller manager's client. // It's limited to resources in the Velero namespace. - client client.Client - repoLocker *repository.RepoLocker - repoEnsurer *repository.Ensurer - fileSystem filesystem.Interface - repoMaintenanceJobConfig string - podResources kube.PodResources - keepLatestMaintenanceJobs int - log logrus.FieldLogger - logLevel logrus.Level - logFormat *logging.FormatFlag + client client.Client + repoLocker *repository.RepoLocker + fileSystem filesystem.Interface + log logrus.FieldLogger } // NewManager create a new repository manager. @@ -93,29 +79,17 @@ func NewManager( namespace string, client client.Client, repoLocker *repository.RepoLocker, - repoEnsurer *repository.Ensurer, credentialFileStore credentials.FileStore, credentialSecretStore credentials.SecretStore, - repoMaintenanceJobConfig string, - podResources kube.PodResources, - keepLatestMaintenanceJobs int, log logrus.FieldLogger, - logLevel logrus.Level, - logFormat *logging.FormatFlag, ) Manager { mgr := &manager{ - namespace: namespace, - client: client, - providers: map[string]provider.Provider{}, - repoLocker: repoLocker, - repoEnsurer: repoEnsurer, - fileSystem: filesystem.NewFileSystem(), - repoMaintenanceJobConfig: repoMaintenanceJobConfig, - podResources: podResources, - keepLatestMaintenanceJobs: keepLatestMaintenanceJobs, - log: log, - logLevel: logLevel, - logFormat: logFormat, + namespace: namespace, + client: client, + providers: map[string]provider.Provider{}, + repoLocker: repoLocker, + fileSystem: filesystem.NewFileSystem(), + log: log, } mgr.providers[velerov1api.BackupRepositoryTypeRestic] = provider.NewResticRepositoryProvider(credentialFileStore, mgr.fileSystem, mgr.log) @@ -172,86 +146,24 @@ func (m *manager) PrepareRepo(repo *velerov1api.BackupRepository) error { return prd.PrepareRepo(context.Background(), param) } -func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) (velerov1api.BackupRepositoryMaintenanceStatus, error) { +func (m *manager) PruneRepo(repo *velerov1api.BackupRepository) error { m.repoLocker.LockExclusive(repo.Name) defer m.repoLocker.UnlockExclusive(repo.Name) - param, err := m.assembleRepoParam(repo) - if err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.WithStack(err) - } - - log := m.log.WithFields(logrus.Fields{ - "BSL name": param.BackupLocation.Name, - "repo type": param.BackupRepo.Spec.RepositoryType, - "repo name": param.BackupRepo.Name, - "repo UID": param.BackupRepo.UID, - }) - - job, err := repository.GetLatestMaintenanceJob(m.client, m.namespace) - if err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.WithStack(err) - } - - if job != nil && job.Status.Succeeded == 0 && job.Status.Failed == 0 { - log.Debugf("There already has a unfinished maintenance job %s/%s for repository %s, please wait for it to complete", job.Namespace, job.Name, param.BackupRepo.Name) - return velerov1api.BackupRepositoryMaintenanceStatus{}, nil - } - - jobConfig, err := repository.GetMaintenanceJobConfig( - 
context.Background(), - m.client, - m.log, - m.namespace, - m.repoMaintenanceJobConfig, - repo, - ) - if err != nil { - log.Infof("Fail to find the ConfigMap %s to build maintenance job with error: %s. Use default value.", - m.namespace+"/"+m.repoMaintenanceJobConfig, - err.Error(), - ) - } - - log.Info("Start to maintenance repo") - - maintenanceJob, err := m.buildMaintenanceJob( - jobConfig, - param, - ) + prd, err := m.getRepositoryProvider(repo) if err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to build maintenance job") - } - - log = log.WithField("job", fmt.Sprintf("%s/%s", maintenanceJob.Namespace, maintenanceJob.Name)) - - if err := m.client.Create(context.TODO(), maintenanceJob); err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to create maintenance job") + return errors.WithStack(err) } - log.Debug("Creating maintenance job") - - defer func() { - if err := repository.DeleteOldMaintenanceJobs( - m.client, - param.BackupRepo.Name, - m.keepLatestMaintenanceJobs, - ); err != nil { - log.WithError(err).Error("Failed to delete maintenance job") - } - }() - - maintenanceJob, err = repository.WaitForJobComplete(context.TODO(), m.client, maintenanceJob.Namespace, maintenanceJob.Name) + param, err := m.assembleRepoParam(repo) if err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to wait for maintenance job complete") + return errors.WithStack(err) } - result, err := repository.GetMaintenanceResultFromJob(m.client, maintenanceJob) - if err != nil { - return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to get maintenance job result") + if err := prd.BoostRepoConnect(context.Background(), param); err != nil { + return errors.WithStack(err) } - log.Info("Maintenance repo complete") - return repository.ComposeMaintenanceStatusFromJob(maintenanceJob, result), nil + return prd.PruneRepo(context.Background(), param) } func (m *manager) UnlockRepo(repo *velerov1api.BackupRepository) error { @@ -344,122 +256,3 @@ func (m *manager) assembleRepoParam(repo *velerov1api.BackupRepository) (provide BackupRepo: repo, }, nil } - -func (m *manager) buildMaintenanceJob( - config *repository.JobConfigs, - param provider.RepoParam, -) (*batchv1.Job, error) { - // Get the Velero server deployment - deployment := &appsv1.Deployment{} - err := m.client.Get(context.TODO(), types.NamespacedName{Name: "velero", Namespace: m.namespace}, deployment) - if err != nil { - return nil, err - } - - // Get the environment variables from the Velero server deployment - envVars := veleroutil.GetEnvVarsFromVeleroServer(deployment) - - // Get the referenced storage from the Velero server deployment - envFromSources := veleroutil.GetEnvFromSourcesFromVeleroServer(deployment) - - // Get the volume mounts from the Velero server deployment - volumeMounts := veleroutil.GetVolumeMountsFromVeleroServer(deployment) - - // Get the volumes from the Velero server deployment - volumes := veleroutil.GetVolumesFromVeleroServer(deployment) - - // Get the service account from the Velero server deployment - serviceAccount := veleroutil.GetServiceAccountFromVeleroServer(deployment) - - // Get image - image := veleroutil.GetVeleroServerImage(deployment) - - // Set resource limits and requests - cpuRequest := m.podResources.CPURequest - memRequest := m.podResources.MemoryRequest - cpuLimit := m.podResources.CPULimit - memLimit := m.podResources.MemoryLimit - if config != nil && 
config.PodResources != nil { - cpuRequest = config.PodResources.CPURequest - memRequest = config.PodResources.MemoryRequest - cpuLimit = config.PodResources.CPULimit - memLimit = config.PodResources.MemoryLimit - } - resources, err := kube.ParseResourceRequirements(cpuRequest, memRequest, cpuLimit, memLimit) - if err != nil { - return nil, errors.Wrap(err, "failed to parse resource requirements for maintenance job") - } - - // Set arguments - args := []string{"repo-maintenance"} - args = append(args, fmt.Sprintf("--repo-name=%s", param.BackupRepo.Spec.VolumeNamespace)) - args = append(args, fmt.Sprintf("--repo-type=%s", param.BackupRepo.Spec.RepositoryType)) - args = append(args, fmt.Sprintf("--backup-storage-location=%s", param.BackupLocation.Name)) - args = append(args, fmt.Sprintf("--log-level=%s", m.logLevel.String())) - args = append(args, fmt.Sprintf("--log-format=%s", m.logFormat.String())) - - // build the maintenance job - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: repository.GenerateJobName(param.BackupRepo.Name), - Namespace: param.BackupRepo.Namespace, - Labels: map[string]string{ - repository.RepositoryNameLabel: param.BackupRepo.Name, - }, - }, - Spec: batchv1.JobSpec{ - BackoffLimit: new(int32), // Never retry - Template: v1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Name: "velero-repo-maintenance-pod", - Labels: map[string]string{ - repository.RepositoryNameLabel: param.BackupRepo.Name, - }, - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "velero-repo-maintenance-container", - Image: image, - Command: []string{ - "/velero", - }, - Args: args, - ImagePullPolicy: v1.PullIfNotPresent, - Env: envVars, - EnvFrom: envFromSources, - VolumeMounts: volumeMounts, - Resources: resources, - }, - }, - RestartPolicy: v1.RestartPolicyNever, - Volumes: volumes, - ServiceAccountName: serviceAccount, - }, - }, - }, - } - - if config != nil && len(config.LoadAffinities) > 0 { - affinity := kube.ToSystemAffinity(config.LoadAffinities) - job.Spec.Template.Spec.Affinity = affinity - } - - if tolerations := veleroutil.GetTolerationsFromVeleroServer(deployment); tolerations != nil { - job.Spec.Template.Spec.Tolerations = tolerations - } - - if nodeSelector := veleroutil.GetNodeSelectorFromVeleroServer(deployment); nodeSelector != nil { - job.Spec.Template.Spec.NodeSelector = nodeSelector - } - - if labels := veleroutil.GetVeleroServerLables(deployment); len(labels) > 0 { - job.Spec.Template.Labels = labels - } - - if annotations := veleroutil.GetVeleroServerAnnotations(deployment); len(annotations) > 0 { - job.Spec.Template.Annotations = annotations - } - - return job, nil -} diff --git a/pkg/repository/manager/manager_test.go b/pkg/repository/manager/manager_test.go index e83f5f5276..d743e267ae 100644 --- a/pkg/repository/manager/manager_test.go +++ b/pkg/repository/manager/manager_test.go @@ -17,31 +17,18 @@ limitations under the License. 
package repository import ( - "fmt" "testing" - "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" kbclient "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" - velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" - "github.com/vmware-tanzu/velero/pkg/repository" - "github.com/vmware-tanzu/velero/pkg/repository/provider" - "github.com/vmware-tanzu/velero/pkg/util/kube" - "github.com/vmware-tanzu/velero/pkg/util/logging" ) func TestGetRepositoryProvider(t *testing.T) { var fakeClient kbclient.Client - mgr := NewManager("", fakeClient, nil, nil, nil, nil, "", kube.PodResources{}, 3, nil, logrus.InfoLevel, nil).(*manager) + mgr := NewManager("", fakeClient, nil, nil, nil, nil).(*manager) repo := &velerov1.BackupRepository{} // empty repository type @@ -60,220 +47,3 @@ func TestGetRepositoryProvider(t *testing.T) { _, err = mgr.getRepositoryProvider(repo) require.Error(t, err) } - -func TestBuildMaintenanceJob(t *testing.T) { - testCases := []struct { - name string - m *repository.JobConfigs - deploy *appsv1.Deployment - logLevel logrus.Level - logFormat *logging.FormatFlag - expectedJobName string - expectedError bool - expectedEnv []v1.EnvVar - expectedEnvFrom []v1.EnvFromSource - }{ - { - name: "Valid maintenance job", - m: &repository.JobConfigs{ - PodResources: &kube.PodResources{ - CPURequest: "100m", - MemoryRequest: "128Mi", - CPULimit: "200m", - MemoryLimit: "256Mi", - }, - }, - deploy: &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "velero", - Namespace: "velero", - }, - Spec: appsv1.DeploymentSpec{ - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "velero-repo-maintenance-container", - Image: "velero-image", - Env: []v1.EnvVar{ - { - Name: "test-name", - Value: "test-value", - }, - }, - EnvFrom: []v1.EnvFromSource{ - { - ConfigMapRef: &v1.ConfigMapEnvSource{ - LocalObjectReference: v1.LocalObjectReference{ - Name: "test-configmap", - }, - }, - }, - { - SecretRef: &v1.SecretEnvSource{ - LocalObjectReference: v1.LocalObjectReference{ - Name: "test-secret", - }, - }, - }, - }, - }, - }, - }, - }, - }, - }, - logLevel: logrus.InfoLevel, - logFormat: logging.NewFormatFlag(), - expectedJobName: "test-123-maintain-job", - expectedError: false, - expectedEnv: []v1.EnvVar{ - { - Name: "test-name", - Value: "test-value", - }, - }, - expectedEnvFrom: []v1.EnvFromSource{ - { - ConfigMapRef: &v1.ConfigMapEnvSource{ - LocalObjectReference: v1.LocalObjectReference{ - Name: "test-configmap", - }, - }, - }, - { - SecretRef: &v1.SecretEnvSource{ - LocalObjectReference: v1.LocalObjectReference{ - Name: "test-secret", - }, - }, - }, - }, - }, - { - name: "Error getting Velero server deployment", - m: &repository.JobConfigs{ - PodResources: &kube.PodResources{ - CPURequest: "100m", - MemoryRequest: "128Mi", - CPULimit: "200m", - MemoryLimit: "256Mi", - }, - }, - logLevel: logrus.InfoLevel, - logFormat: logging.NewFormatFlag(), - expectedJobName: "", - expectedError: true, - }, - } - - param := provider.RepoParam{ - BackupRepo: &velerov1api.BackupRepository{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "velero", - Name: "test-123", - }, - Spec: velerov1api.BackupRepositorySpec{ - VolumeNamespace: 
"test-123", - RepositoryType: "kopia", - }, - }, - BackupLocation: &velerov1api.BackupStorageLocation{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "velero", - Name: "test-location", - }, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - // Create a fake clientset with resources - objs := []runtime.Object{param.BackupLocation, param.BackupRepo} - - if tc.deploy != nil { - objs = append(objs, tc.deploy) - } - scheme := runtime.NewScheme() - _ = appsv1.AddToScheme(scheme) - _ = velerov1api.AddToScheme(scheme) - cli := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(objs...).Build() - - mgr := NewManager( - "velero", - cli, - nil, - nil, - nil, - nil, - "", - kube.PodResources{}, - 3, - nil, - logrus.InfoLevel, - logging.NewFormatFlag(), - ).(*manager) - - // Call the function to test - job, err := mgr.buildMaintenanceJob(tc.m, param) - - // Check the error - if tc.expectedError { - assert.Error(t, err) - assert.Nil(t, job) - } else { - assert.NoError(t, err) - assert.NotNil(t, job) - assert.Contains(t, job.Name, tc.expectedJobName) - assert.Equal(t, param.BackupRepo.Namespace, job.Namespace) - assert.Equal(t, param.BackupRepo.Name, job.Labels[repository.RepositoryNameLabel]) - - assert.Equal(t, param.BackupRepo.Name, job.Spec.Template.ObjectMeta.Labels[repository.RepositoryNameLabel]) - - // Check container - assert.Len(t, job.Spec.Template.Spec.Containers, 1) - container := job.Spec.Template.Spec.Containers[0] - assert.Equal(t, "velero-repo-maintenance-container", container.Name) - assert.Equal(t, "velero-image", container.Image) - assert.Equal(t, v1.PullIfNotPresent, container.ImagePullPolicy) - - // Check container env - assert.Equal(t, tc.expectedEnv, container.Env) - assert.Equal(t, tc.expectedEnvFrom, container.EnvFrom) - - // Check resources - expectedResources := v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse(tc.m.PodResources.CPURequest), - v1.ResourceMemory: resource.MustParse(tc.m.PodResources.MemoryRequest), - }, - Limits: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse(tc.m.PodResources.CPULimit), - v1.ResourceMemory: resource.MustParse(tc.m.PodResources.MemoryLimit), - }, - } - assert.Equal(t, expectedResources, container.Resources) - - // Check args - expectedArgs := []string{ - "repo-maintenance", - fmt.Sprintf("--repo-name=%s", param.BackupRepo.Spec.VolumeNamespace), - fmt.Sprintf("--repo-type=%s", param.BackupRepo.Spec.RepositoryType), - fmt.Sprintf("--backup-storage-location=%s", param.BackupLocation.Name), - fmt.Sprintf("--log-level=%s", tc.logLevel.String()), - fmt.Sprintf("--log-format=%s", tc.logFormat.String()), - } - assert.Equal(t, expectedArgs, container.Args) - - // Check affinity - assert.Nil(t, job.Spec.Template.Spec.Affinity) - - // Check tolerations - assert.Nil(t, job.Spec.Template.Spec.Tolerations) - - // Check node selector - assert.Nil(t, job.Spec.Template.Spec.NodeSelector) - } - }) - } -} diff --git a/pkg/repository/mocks/Manager.go b/pkg/repository/mocks/Manager.go index c987693b4b..2264117752 100644 --- a/pkg/repository/mocks/Manager.go +++ b/pkg/repository/mocks/Manager.go @@ -138,31 +138,21 @@ func (_m *Manager) PrepareRepo(repo *v1.BackupRepository) error { } // PruneRepo provides a mock function with given fields: repo -func (_m *Manager) PruneRepo(repo *v1.BackupRepository) (v1.BackupRepositoryMaintenanceStatus, error) { +func (_m *Manager) PruneRepo(repo *v1.BackupRepository) error { ret := _m.Called(repo) if len(ret) == 0 { panic("no return value 
specified for PruneRepo") } - var r0 v1.BackupRepositoryMaintenanceStatus - var r1 error - if rf, ok := ret.Get(0).(func(*v1.BackupRepository) (v1.BackupRepositoryMaintenanceStatus, error)); ok { - return rf(repo) - } - if rf, ok := ret.Get(0).(func(*v1.BackupRepository) v1.BackupRepositoryMaintenanceStatus); ok { + var r0 error + if rf, ok := ret.Get(0).(func(*v1.BackupRepository) error); ok { r0 = rf(repo) } else { - r0 = ret.Get(0).(v1.BackupRepositoryMaintenanceStatus) - } - - if rf, ok := ret.Get(1).(func(*v1.BackupRepository) error); ok { - r1 = rf(repo) - } else { - r1 = ret.Error(1) + r0 = ret.Error(0) } - return r0, r1 + return r0 } // UnlockRepo provides a mock function with given fields: repo From 6b73a256d59f9423439402ffa83089ec5d6ca1a6 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Mon, 6 Jan 2025 16:39:00 +0800 Subject: [PATCH 4/9] recall repo maintenance history on restart Signed-off-by: Lyndon-Li --- changelogs/unreleased/8580-Lyndon-Li | 1 + pkg/cmd/cli/repomantenance/maintenance.go | 26 +++++------ pkg/cmd/server/server.go | 2 +- .../backup_repository_controller.go | 9 ++-- pkg/repository/keys/keys.go | 12 +++-- pkg/repository/maintenance.go | 29 ++---------- pkg/repository/maintenance_test.go | 44 ------------------- 7 files changed, 26 insertions(+), 97 deletions(-) create mode 100644 changelogs/unreleased/8580-Lyndon-Li diff --git a/changelogs/unreleased/8580-Lyndon-Li b/changelogs/unreleased/8580-Lyndon-Li new file mode 100644 index 0000000000..87862b9529 --- /dev/null +++ b/changelogs/unreleased/8580-Lyndon-Li @@ -0,0 +1 @@ +Fix issue #7753, recall repo maintenance history on Velero server restart \ No newline at end of file diff --git a/pkg/cmd/cli/repomantenance/maintenance.go b/pkg/cmd/cli/repomantenance/maintenance.go index 80721a4369..2d9287296d 100644 --- a/pkg/cmd/cli/repomantenance/maintenance.go +++ b/pkg/cmd/cli/repomantenance/maintenance.go @@ -13,7 +13,6 @@ import ( "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/vmware-tanzu/velero/internal/credentials" @@ -88,41 +87,36 @@ func (o *Options) Run(f velerocli.Factory) { } } -func (o *Options) initClient(f velerocli.Factory) (client.Client, kubernetes.Interface, error) { +func (o *Options) initClient(f velerocli.Factory) (client.Client, error) { scheme := runtime.NewScheme() err := velerov1api.AddToScheme(scheme) if err != nil { - return nil, nil, errors.Wrap(err, "failed to add velero scheme") + return nil, errors.Wrap(err, "failed to add velero scheme") } err = v1.AddToScheme(scheme) if err != nil { - return nil, nil, errors.Wrap(err, "failed to add api core scheme") + return nil, errors.Wrap(err, "failed to add api core scheme") } config, err := f.ClientConfig() if err != nil { - return nil, nil, errors.Wrap(err, "failed to get client config") + return nil, errors.Wrap(err, "failed to get client config") } cli, err := client.New(config, client.Options{ Scheme: scheme, }) if err != nil { - return nil, nil, errors.Wrap(err, "failed to create client") + return nil, errors.Wrap(err, "failed to create client") } - kubeClient, err := f.KubeClient() - if err != nil { - return nil, nil, errors.Wrap(err, "failed to create kube client") - } - - return cli, kubeClient, nil + return cli, nil } -func initRepoManager(namespace string, kubeClient kubernetes.Interface, cli client.Client, logger logrus.FieldLogger) (repomanager.Manager, error) { +func initRepoManager(namespace string, cli 
client.Client, logger logrus.FieldLogger) (repomanager.Manager, error) {
 	// ensure the repo key secret is set up
-	if err := repokey.EnsureCommonRepositoryKey(kubeClient.CoreV1(), namespace); err != nil {
+	if err := repokey.EnsureCommonRepositoryKey(cli, namespace); err != nil {
 		return nil, errors.Wrap(err, "failed to ensure repository key")
 	}
 
@@ -154,12 +148,12 @@ func initRepoManager(namespace string, kubeClient kubernetes.Interface, cli clie
 }
 
 func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger logrus.FieldLogger) error {
-	cli, kubeClient, err := o.initClient(f)
+	cli, err := o.initClient(f)
 	if err != nil {
 		return err
 	}
 
-	manager, err := initRepoManager(namespace, kubeClient, cli, logger)
+	manager, err := initRepoManager(namespace, cli, logger)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go
index dd616ad65a..01a2230d11 100644
--- a/pkg/cmd/server/server.go
+++ b/pkg/cmd/server/server.go
@@ -480,7 +480,7 @@ func (s *server) checkNodeAgent() {
 func (s *server) initRepoManager() error {
 	// ensure the repo key secret is set up
-	if err := repokey.EnsureCommonRepositoryKey(s.kubeClient.CoreV1(), s.namespace); err != nil {
+	if err := repokey.EnsureCommonRepositoryKey(s.mgr.GetClient(), s.namespace); err != nil {
 		return err
 	}
diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go
index 52d3cf04c6..92266ee1bf 100644
--- a/pkg/controller/backup_repository_controller.go
+++ b/pkg/controller/backup_repository_controller.go
@@ -338,12 +338,9 @@ func (r *BackupRepoReconciler) recallMaintenance(ctx context.Context, req *veler
 
 	log.Warn("Updating backup repository because of unrecorded histories")
 
-	if lastMaintenanceTime.After(req.Status.LastMaintenanceTime.Time) {
-		log.Warnf("Updating backup repository last maintenance time (%v) from history (%v)", req.Status.LastMaintenanceTime.Time, lastMaintenanceTime.Time)
-	}
-
 	return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
 		if lastMaintenanceTime.After(rr.Status.LastMaintenanceTime.Time) {
+			log.Warnf("Updating backup repository last maintenance time (%v) from history (%v)", rr.Status.LastMaintenanceTime.Time, lastMaintenanceTime.Time)
 			rr.Status.LastMaintenanceTime = lastMaintenanceTime
 		}
 
@@ -453,8 +450,8 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel
-	// when WaitMaintenanceJobComplete fails, the maintenance result will be left temporarily
-	// If the maintenenance still completes later, recallMaintenance recalls the left onces and update LastMaintenanceTime and history
+	// when WaitMaintenanceJobComplete fails, the maintenance result will be left aside temporarily
+	// If the maintenance still completes later, recallMaintenance recalls the ones left and updates LastMaintenanceTime and history
 	status, err := funcWaitMaintenanceJobComplete(r.Client, ctx, job, r.namespace, log)
 	if err != nil {
 		return errors.Wrapf(err, "error waiting repo maintenance completion status")
diff --git a/pkg/repository/keys/keys.go b/pkg/repository/keys/keys.go
index 21423afe09..15f4dae9e0 100644
--- a/pkg/repository/keys/keys.go
+++ b/pkg/repository/keys/keys.go
@@ -24,9 +24,11 @@ import (
 	corev1api "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
+	"k8s.io/apimachinery/pkg/types"
 
 	"github.com/vmware-tanzu/velero/pkg/builder"
+
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 const (
@@ -36,11 +38,13 @@ const (
 	encryptionKey = "static-passw0rd"
 )
 
-func EnsureCommonRepositoryKey(secretClient corev1client.SecretsGetter, namespace string) error {
-	_, err := secretClient.Secrets(namespace).Get(context.TODO(), credentialsSecretName, metav1.GetOptions{})
+func EnsureCommonRepositoryKey(cli client.Client, namespace string) error {
+	existing := &corev1api.Secret{}
+	err := cli.Get(context.TODO(), types.NamespacedName{Namespace: namespace, Name: credentialsSecretName}, existing)
 	if err != nil && !apierrors.IsNotFound(err) {
 		return errors.WithStack(err)
 	}
+
 	if err == nil {
 		return nil
 	}
@@ -58,7 +62,7 @@ func EnsureCommonRepositoryKey(secretClient corev1client.SecretsGetter, namespac
 		},
 	}
 
-	if _, err = secretClient.Secrets(namespace).Create(context.TODO(), secret, metav1.CreateOptions{}); err != nil {
+	if err := cli.Create(context.TODO(), secret); err != nil {
 		return errors.Wrapf(err, "error creating %s secret", credentialsSecretName)
 	}
diff --git a/pkg/repository/maintenance.go b/pkg/repository/maintenance.go
index 0e048d0a76..c1b4276a57 100644
--- a/pkg/repository/maintenance.go
+++ b/pkg/repository/maintenance.go
@@ -148,31 +148,6 @@ func getMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, e
 	return terminated.Message, nil
 }
 
-func getLatestMaintenanceJob(cli client.Client, ns string) (*batchv1.Job, error) {
-	// Get the maintenance job list by label
-	jobList := &batchv1.JobList{}
-	err := cli.List(context.TODO(), jobList, &client.ListOptions{
-		Namespace: ns,
-	},
-		&client.HasLabels{RepositoryNameLabel},
-	)
-
-	if err != nil {
-		return nil, err
-	}
-
-	if len(jobList.Items) == 0 {
-		return nil, nil
-	}
-
-	// Get the latest maintenance job
-	sort.Slice(jobList.Items, func(i, j int) bool {
-		return jobList.Items[i].CreationTimestamp.Time.After(jobList.Items[j].CreationTimestamp.Time)
-	})
-
-	return &jobList.Items[0], nil
-}
-
 // getMaintenanceJobConfig is called to get the Maintenance Job Config for the
 // BackupRepository specified by the repo parameter.
 //
@@ -266,6 +241,7 @@ getMaintenanceJobConfig
 	return result, nil
 }
 
+// WaitMaintenanceJobComplete waits for the completion of the specified maintenance job and returns the BackupRepositoryMaintenanceStatus
 func WaitMaintenanceJobComplete(cli client.Client, ctx context.Context, jobName, ns string, logger logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
 	log := logger.WithField("job name", jobName)
 
@@ -285,7 +261,7 @@ func WaitMaintenanceJobComplete(cli client.Client, ctx context.Context, jobName,
 }
 
 // WaitAllMaintenanceJobComplete checks all the incomplete maintenance jobs of the specified repo and wait for them to complete,
-// and then return the maintenance jobs in the range of limit
+// and then returns the maintenance jobs' statuses within the given limit
 func WaitAllMaintenanceJobComplete(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) {
 	jobList := &batchv1.JobList{}
 	err := cli.List(context.TODO(), jobList, &client.ListOptions{
@@ -338,6 +314,7 @@ func WaitAllMaintenanceJobComplete(ctx context.Context, cli client.Client, repo
 	return history, nil
 }
 
+// StartMaintenanceJob creates a new maintenance job
 func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, repoMaintenanceJobConfig string,
 	podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag, logger logrus.FieldLogger) (string, error) {
 	bsl := &velerov1api.BackupStorageLocation{}
diff --git a/pkg/repository/maintenance_test.go b/pkg/repository/maintenance_test.go
index 42e2477d2c..6d04a7837a 100644
--- a/pkg/repository/maintenance_test.go
+++ b/pkg/repository/maintenance_test.go
@@ -269,50 +269,6 @@ func TestGetMaintenanceResultFromJob(t *testing.T) {
 	assert.Equal(t, "test message", result)
 }
 
-func TestGetLatestMaintenanceJob(t *testing.T) {
-	// Set up test repo
-	repo := "test-repo"
-
-	// Create some maintenance jobs for testing
-	var objs []client.Object
-	// Create a newer job
-	newerJob := &batchv1.Job{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "job1",
-			Namespace: "default",
-			Labels:    map[string]string{RepositoryNameLabel: repo},
-			CreationTimestamp: metav1.Time{
-				Time: metav1.Now().Add(time.Duration(-24) * time.Hour),
-			},
-		},
-		Spec: batchv1.JobSpec{},
-	}
-	objs = append(objs, newerJob)
-
-	// Create an older job
-	olderJob := &batchv1.Job{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "job2",
-			Namespace: "default",
-			Labels:    map[string]string{RepositoryNameLabel: repo},
-		},
-		Spec: batchv1.JobSpec{},
-	}
-	objs = append(objs, olderJob)
-
-	// Create a fake Kubernetes client
-	scheme := runtime.NewScheme()
-	_ = batchv1.AddToScheme(scheme)
-	cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
-
-	// Call the function
-	job, err := getLatestMaintenanceJob(cli, "default")
-	assert.NoError(t, err)
-
-	// We expect the returned job to be the newer job
-	assert.Equal(t, newerJob.Name, job.Name)
-}
-
 func TestGetMaintenanceJobConfig(t *testing.T) {
 	ctx := context.Background()
 	logger := logrus.New()

From 4ce7361f5a37183e3137a8d46fe1d4ac92f0f6c2 Mon Sep 17 00:00:00 2001
From: Lyndon-Li
Date: Mon, 6 Jan 2025 17:35:57 +0800
Subject: [PATCH 5/9] recall repo maintenance history on restart

Signed-off-by: Lyndon-Li
---
 go.mod                                    |  2 +-
 pkg/cmd/cli/repomantenance/maintenance.go | 16 +-
 pkg/cmd/server/server.go                  |  2 +-
 .../backup_repository_controller.go       | 51 ++-
 .../backup_repository_controller_test.go
| 337 ++++++++++++++++++ pkg/repository/keys/keys.go | 12 +- 6 files changed, 379 insertions(+), 41 deletions(-) diff --git a/go.mod b/go.mod index 1b72e62099..51e58a1a59 100644 --- a/go.mod +++ b/go.mod @@ -31,6 +31,7 @@ require ( github.com/kubernetes-csi/external-snapshotter/client/v7 v7.0.0 github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 + github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.20.5 github.com/robfig/cron/v3 v3.0.1 @@ -154,7 +155,6 @@ require ( github.com/natefinch/atomic v1.0.1 // indirect github.com/nxadm/tail v1.4.8 // indirect github.com/oklog/run v1.0.0 // indirect - github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect github.com/pierrec/lz4 v2.6.1+incompatible // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect diff --git a/pkg/cmd/cli/repomantenance/maintenance.go b/pkg/cmd/cli/repomantenance/maintenance.go index 5e83a079c4..7ca6d7505e 100644 --- a/pkg/cmd/cli/repomantenance/maintenance.go +++ b/pkg/cmd/cli/repomantenance/maintenance.go @@ -13,7 +13,10 @@ import ( "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log/zap" "github.com/vmware-tanzu/velero/internal/credentials" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -65,6 +68,8 @@ func (o *Options) Run(f velerocli.Factory) { logger := logging.DefaultLogger(o.LogLevelFlag.Parse(), o.FormatFlag.Parse()) logger.SetOutput(os.Stdout) + ctrl.SetLogger(zap.New(zap.UseDevMode(true))) + time.Sleep(time.Minute) pruneError := o.runRepoPrune(f, f.Namespace(), logger) @@ -116,9 +121,9 @@ func (o *Options) initClient(f velerocli.Factory) (client.Client, error) { return cli, nil } -func initRepoManager(namespace string, cli client.Client, logger logrus.FieldLogger) (repomanager.Manager, error) { +func initRepoManager(namespace string, cli client.Client, kubeClient kubernetes.Interface, logger logrus.FieldLogger) (repomanager.Manager, error) { // ensure the repo key secret is set up - if err := repokey.EnsureCommonRepositoryKey(cli, namespace); err != nil { + if err := repokey.EnsureCommonRepositoryKey(kubeClient.CoreV1(), namespace); err != nil { return nil, errors.Wrap(err, "failed to ensure repository key") } @@ -155,7 +160,12 @@ func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger log return err } - manager, err := initRepoManager(namespace, cli, logger) + kubeClient, err := f.KubeClient() + if err != nil { + return err + } + + manager, err := initRepoManager(namespace, cli, kubeClient, logger) if err != nil { return err } diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index 82c876761d..396970b92a 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -491,7 +491,7 @@ func (s *server) checkNodeAgent() { func (s *server) initRepoManager() error { // ensure the repo key secret is set up - if err := repokey.EnsureCommonRepositoryKey(s.mgr.GetClient(), s.namespace); err != nil { + if err := repokey.EnsureCommonRepositoryKey(s.kubeClient.CoreV1(), s.namespace); err != nil { return err } diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index 92266ee1bf..2f6fa42220 100644 --- 
a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -25,6 +25,7 @@ import ( "slices" "time" + "github.com/petar/GoLLRB/llrb" "github.com/pkg/errors" "github.com/sirupsen/logrus" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -348,43 +349,45 @@ func (r *BackupRepoReconciler) recallMaintenance(ctx context.Context, req *veler }) } +type maintenanceStatusWrapper struct { + status *velerov1api.BackupRepositoryMaintenanceStatus +} + +func (w maintenanceStatusWrapper) Less(other llrb.Item) bool { + return w.status.StartTimestamp.Before(other.(maintenanceStatusWrapper).status.StartTimestamp) +} + func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceStatus) []velerov1api.BackupRepositoryMaintenanceStatus { if len(coming) == 0 { return nil } - if isIdenticalHistory(coming, cur) { + if isIdenticalHistory(cur, coming) { return nil } + consolidator := llrb.New() + for i := range cur { + consolidator.ReplaceOrInsert(maintenanceStatusWrapper{&cur[i]}) + } + + for i := range coming { + consolidator.ReplaceOrInsert(maintenanceStatusWrapper{&coming[i]}) + } + truncated := []velerov1api.BackupRepositoryMaintenanceStatus{} - i := len(cur) - 1 - j := len(coming) - 1 - for i >= 0 || j >= 0 { + for consolidator.Len() > 0 { if len(truncated) == defaultMaintenanceStatusQueueLength { break } - if i >= 0 && j >= 0 { - if isEarlierMaintenanceStatus(cur[i], coming[j]) { - truncated = append(truncated, coming[j]) - j-- - } else { - truncated = append(truncated, cur[i]) - i-- - } - } else if i >= 0 { - truncated = append(truncated, cur[i]) - i-- - } else { - truncated = append(truncated, coming[j]) - j-- - } + item := consolidator.DeleteMax() + truncated = append(truncated, *item.(maintenanceStatusWrapper).status) } slices.Reverse(truncated) - if isIdenticalHistory(truncated, cur) { + if isIdenticalHistory(cur, truncated) { return nil } @@ -421,10 +424,6 @@ func isIdenticalHistory(a, b []velerov1api.BackupRepositoryMaintenanceStatus) bo return true } -func isEarlierMaintenanceStatus(a, b velerov1api.BackupRepositoryMaintenanceStatus) bool { - return a.StartTimestamp.Before(b.StartTimestamp) -} - var funcStartMaintenanceJob = repository.StartMaintenanceJob var funcWaitMaintenanceJobComplete = repository.WaitMaintenanceJobComplete @@ -438,10 +437,6 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel log.Info("Running maintenance on backup repository") - // prune failures should be displayed in the `.status.message` field but - // should not cause the repo to move to `NotReady`. 
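The LLRB tree introduced above does two jobs: it deduplicates entries that share a StartTimestamp (ReplaceOrInsert overwrites equal items, so coming wins ties because it is inserted after cur), and it yields the newest defaultMaintenanceStatusQueueLength entries in chronological order. A sort-based sketch of the same idea, assuming the surrounding types; illustrative only, not the patch's code:

    import (
    	"sort"

    	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
    )

    // mergeHistories dedupes two histories by start timestamp (entries from
    // coming win ties) and keeps only the newest `limit` entries, oldest first.
    func mergeHistories(cur, coming []velerov1api.BackupRepositoryMaintenanceStatus, limit int) []velerov1api.BackupRepositoryMaintenanceStatus {
    	byStart := map[int64]velerov1api.BackupRepositoryMaintenanceStatus{}
    	for _, s := range cur {
    		byStart[s.StartTimestamp.Unix()] = s
    	}
    	for _, s := range coming {
    		byStart[s.StartTimestamp.Unix()] = s // overwrite, like ReplaceOrInsert
    	}

    	merged := make([]velerov1api.BackupRepositoryMaintenanceStatus, 0, len(byStart))
    	for _, s := range byStart {
    		merged = append(merged, s)
    	}
    	sort.Slice(merged, func(i, j int) bool {
    		return merged[i].StartTimestamp.Before(merged[j].StartTimestamp)
    	})

    	if len(merged) > limit {
    		merged = merged[len(merged)-limit:] // drop the oldest overflow
    	}
    	return merged
    }

Keying on Unix seconds collapses sub-second ties, which is acceptable for an illustration but is one reason the real code compares metav1.Time values directly.
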
- log.Debug("Pruning repo") - job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.podResources, r.logLevel, r.logFormat, log) if err != nil { log.WithError(err).Warn("Starting repo maintenance failed") diff --git a/pkg/controller/backup_repository_controller_test.go b/pkg/controller/backup_repository_controller_test.go index b3a358fabc..7ad65de0d8 100644 --- a/pkg/controller/backup_repository_controller_test.go +++ b/pkg/controller/backup_repository_controller_test.go @@ -846,3 +846,340 @@ func TestRecallMaintenance(t *testing.T) { }) } } + +func TestConsolidateHistory(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + cur []velerov1api.BackupRepositoryMaintenanceStatus + coming []velerov1api.BackupRepositoryMaintenanceStatus + expected []velerov1api.BackupRepositoryMaintenanceStatus + }{ + { + name: "zero coming", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + }, + expected: nil, + }, + { + name: "identical coming", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + }, + coming: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + }, + }, + expected: nil, + }, + { + name: "less than limit", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + }, + coming: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + }, + expected: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + }, + }, + { + name: "more than limit", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: 
"fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + }, + coming: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-4", + }, + }, + expected: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-4", + }, + }, + }, + { + name: "more than limit 2", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + }, + coming: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-4", + }, + }, + expected: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: 
velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-4", + }, + }, + }, + { + name: "coming is not used", + cur: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 4)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-4", + }, + }, + coming: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + }, + }, + expected: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + consolidated := consolidateHistory(test.coming, test.cur) + if test.expected == nil { + assert.Nil(t, consolidated) + } else { + assert.Len(t, consolidated, len(test.expected)) + for i := 0; i < len(test.expected); i++ { + assert.Equal(t, *test.expected[i].StartTimestamp, *consolidated[i].StartTimestamp) + assert.Equal(t, *test.expected[i].CompleteTimestamp, *consolidated[i].CompleteTimestamp) + assert.Equal(t, test.expected[i].Result, consolidated[i].Result) + assert.Equal(t, test.expected[i].Message, consolidated[i].Message) + } + + assert.Nil(t, consolidateHistory(test.coming, consolidated)) + } + }) + } +} + +func TestGetLastMaintenanceTimeFromHistory(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + history []velerov1api.BackupRepositoryMaintenanceStatus + expected time.Time + }{ + { + name: "first one is nil", + history: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + }, + expected: now.Add(time.Hour * 3), + }, + { + name: "another one is nil", + history: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + 
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + }, + expected: now.Add(time.Hour * 3), + }, + { + name: "disordered", + history: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 3)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message-3", + }, + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + Message: "fake-maintenance-message", + }, + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message-2", + }, + }, + expected: now.Add(time.Hour * 3), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + time := getLastMaintenanceTimeFromHistory(test.history) + assert.Equal(t, test.expected, time.Time) + }) + } +} diff --git a/pkg/repository/keys/keys.go b/pkg/repository/keys/keys.go index 15f4dae9e0..21423afe09 100644 --- a/pkg/repository/keys/keys.go +++ b/pkg/repository/keys/keys.go @@ -24,11 +24,9 @@ import ( corev1api "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" "github.com/vmware-tanzu/velero/pkg/builder" - - "sigs.k8s.io/controller-runtime/pkg/client" ) const ( @@ -38,13 +36,11 @@ const ( encryptionKey = "static-passw0rd" ) -func EnsureCommonRepositoryKey(cli client.Client, namespace string) error { - existing := &corev1api.Secret{} - err := cli.Get(context.TODO(), types.NamespacedName{Namespace: namespace, Name: credentialsSecretName}, existing) +func EnsureCommonRepositoryKey(secretClient corev1client.SecretsGetter, namespace string) error { + _, err := secretClient.Secrets(namespace).Get(context.TODO(), credentialsSecretName, metav1.GetOptions{}) if err != nil && !apierrors.IsNotFound(err) { return errors.WithStack(err) } - if err == nil { return nil } @@ -62,7 +58,7 @@ func EnsureCommonRepositoryKey(cli client.Client, namespace string) error { }, } - if err := cli.Create(context.TODO(), secret); err != nil { + if _, err = secretClient.Secrets(namespace).Create(context.TODO(), secret, metav1.CreateOptions{}); err != nil { return errors.Wrapf(err, "error creating %s secret", credentialsSecretName) } From fc9683688a9aedfe3dff181c3ddf5cb41d017b88 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Tue, 7 Jan 2025 13:13:50 +0800 Subject: [PATCH 6/9] move maintenance to a separate folder Signed-off-by: Lyndon-Li --- pkg/cmd/cli/repomantenance/maintenance.go | 6 +- .../backup_repository_controller.go | 32 +- .../backup_repository_controller_test.go | 365 ++++++++++++++---- .../{ => maintenance}/maintenance.go | 40 +- .../{ => maintenance}/maintenance_test.go | 26 +- 5 files changed, 343 insertions(+), 126 deletions(-) rename pkg/repository/{ => maintenance}/maintenance.go (88%) rename pkg/repository/{ => maintenance}/maintenance_test.go (97%) diff --git a/pkg/cmd/cli/repomantenance/maintenance.go b/pkg/cmd/cli/repomantenance/maintenance.go index 7ca6d7505e..56a88de7ec 100644 --- a/pkg/cmd/cli/repomantenance/maintenance.go +++ b/pkg/cmd/cli/repomantenance/maintenance.go @@ -7,6 +7,7 @@ import ( "strings" "time" + 
"github.com/bombsimon/logrusr/v3" "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/spf13/cobra" @@ -16,7 +17,6 @@ import ( "k8s.io/client-go/kubernetes" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log/zap" "github.com/vmware-tanzu/velero/internal/credentials" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -68,9 +68,7 @@ func (o *Options) Run(f velerocli.Factory) { logger := logging.DefaultLogger(o.LogLevelFlag.Parse(), o.FormatFlag.Parse()) logger.SetOutput(os.Stdout) - ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - - time.Sleep(time.Minute) + ctrl.SetLogger(logrusr.New(logger)) pruneError := o.runRepoPrune(f, f.Namespace(), logger) defer func() { diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index 2f6fa42220..f5f49297f9 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -42,8 +42,8 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/constant" "github.com/vmware-tanzu/velero/pkg/label" - "github.com/vmware-tanzu/velero/pkg/repository" repoconfig "github.com/vmware-tanzu/velero/pkg/repository/config" + "github.com/vmware-tanzu/velero/pkg/repository/maintenance" repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager" "github.com/vmware-tanzu/velero/pkg/util/kube" "github.com/vmware-tanzu/velero/pkg/util/logging" @@ -229,7 +229,7 @@ func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, errors.Wrap(err, "error check and run repo maintenance jobs") } - if err := repository.DeleteOldMaintenanceJobs(r.Client, req.Name, r.keepLatestMaintenanceJobs); err != nil { + if err := maintenance.DeleteOldJobs(r.Client, req.Name, r.keepLatestMaintenanceJobs); err != nil { log.WithError(err).Warn("Failed to delete old maintenance jobs") } } @@ -325,7 +325,7 @@ func ensureRepo(repo *velerov1api.BackupRepository, repoManager repomanager.Mana } func (r *BackupRepoReconciler) recallMaintenance(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { - history, err := repository.WaitAllMaintenanceJobComplete(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) + history, err := maintenance.WaitAllJobsComplete(ctx, r.Client, req, defaultMaintenanceStatusQueueLength, log) if err != nil { return errors.Wrapf(err, "error waiting incomplete repo maintenance job for repo %s", req.Name) } @@ -362,7 +362,9 @@ func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceSta return nil } - if isIdenticalHistory(cur, coming) { + if slices.EqualFunc(cur, coming, func(a, b velerov1api.BackupRepositoryMaintenanceStatus) bool { + return a.StartTimestamp.Equal(b.StartTimestamp) + }) { return nil } @@ -387,7 +389,9 @@ func consolidateHistory(coming, cur []velerov1api.BackupRepositoryMaintenanceSta slices.Reverse(truncated) - if isIdenticalHistory(cur, truncated) { + if slices.EqualFunc(cur, truncated, func(a, b velerov1api.BackupRepositoryMaintenanceStatus) bool { + return a.StartTimestamp.Equal(b.StartTimestamp) + }) { return nil } @@ -410,22 +414,8 @@ func getLastMaintenanceTimeFromHistory(history []velerov1api.BackupRepositoryMai return time } -func isIdenticalHistory(a, b []velerov1api.BackupRepositoryMaintenanceStatus) bool { - if len(a) != len(b) { - return false - } - - for i := 0; i < len(a); i++ { - if 
!a[i].StartTimestamp.Equal(b[i].StartTimestamp) { - return false - } - } - - return true -} - -var funcStartMaintenanceJob = repository.StartMaintenanceJob -var funcWaitMaintenanceJobComplete = repository.WaitMaintenanceJobComplete +var funcStartMaintenanceJob = maintenance.StartNewJob +var funcWaitMaintenanceJobComplete = maintenance.WaitJobComplete func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error { startTime := r.clock.Now() diff --git a/pkg/controller/backup_repository_controller_test.go b/pkg/controller/backup_repository_controller_test.go index 7ad65de0d8..cb763fb8f7 100644 --- a/pkg/controller/backup_repository_controller_test.go +++ b/pkg/controller/backup_repository_controller_test.go @@ -23,15 +23,15 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/clock" ctrl "sigs.k8s.io/controller-runtime" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/builder" - "github.com/vmware-tanzu/velero/pkg/repository" + "github.com/vmware-tanzu/velero/pkg/repository/maintenance" repomokes "github.com/vmware-tanzu/velero/pkg/repository/mocks" repotypes "github.com/vmware-tanzu/velero/pkg/repository/types" velerotest "github.com/vmware-tanzu/velero/pkg/test" @@ -39,7 +39,6 @@ import ( "github.com/vmware-tanzu/velero/pkg/util/logging" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake" batchv1 "k8s.io/api/batch/v1" @@ -131,59 +130,268 @@ func waitMaintenanceJobCompleteFail(client.Client, context.Context, string, stri return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.New("fake-wait-error") } -func waitMaintenanceJobCompleteSucceed(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { - return velerov1api.BackupRepositoryMaintenanceStatus{ - StartTimestamp: &metav1.Time{Time: time.Now()}, - CompleteTimestamp: &metav1.Time{Time: time.Now().Add(time.Hour)}, - }, nil +func waitMaintenanceJobCompleteFunc(now time.Time, result velerov1api.BackupRepositoryMaintenanceResult, message string) func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { + return func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { + return velerov1api.BackupRepositoryMaintenanceStatus{ + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: result, + Message: message, + }, nil + } +} + +type fakeClock struct { + now time.Time +} + +func (f *fakeClock) After(time.Duration) <-chan time.Time { + return nil +} + +func (f *fakeClock) NewTicker(time.Duration) clock.Ticker { + return nil +} +func (f *fakeClock) NewTimer(time.Duration) clock.Timer { + return nil +} + +func (f *fakeClock) Now() time.Time { + return f.now +} + +func (f *fakeClock) Since(time.Time) time.Duration { + return 0 +} + +func (f *fakeClock) Sleep(time.Duration) {} + +func (f *fakeClock) Tick(time.Duration) <-chan time.Time { + return nil +} + +func (f *fakeClock) AfterFunc(time.Duration, func()) clock.Timer { + return nil } func 
TestRunMaintenanceIfDue(t *testing.T) { - rr := mockBackupRepositoryCR() - reconciler := mockBackupRepoReconciler(t, "", rr, nil) - funcStartMaintenanceJob = startMaintenanceJobFail - err := reconciler.Client.Create(context.TODO(), rr) - assert.NoError(t, err) - lastTm := rr.Status.LastMaintenanceTime - history := rr.Status.RecentMaintenance - err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) - assert.NoError(t, err) - assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) - assert.NotEqual(t, rr.Status.RecentMaintenance, history) - - rr = mockBackupRepositoryCR() - reconciler = mockBackupRepoReconciler(t, "", rr, nil) - funcStartMaintenanceJob = startMaintenanceJobSucceed - funcWaitMaintenanceJobComplete = waitMaintenanceJobCompleteFail - err = reconciler.Client.Create(context.TODO(), rr) - assert.NoError(t, err) - lastTm = rr.Status.LastMaintenanceTime - history = rr.Status.RecentMaintenance - err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) - assert.EqualError(t, err, "error waiting repo maintenance completion status: fake-wait-error") - assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) - assert.Equal(t, rr.Status.RecentMaintenance, history) - - rr = mockBackupRepositoryCR() - reconciler = mockBackupRepoReconciler(t, "", rr, nil) - funcStartMaintenanceJob = startMaintenanceJobSucceed - funcWaitMaintenanceJobComplete = waitMaintenanceJobCompleteSucceed - err = reconciler.Client.Create(context.TODO(), rr) - assert.NoError(t, err) - lastTm = rr.Status.LastMaintenanceTime - history = rr.Status.RecentMaintenance - err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) - assert.NoError(t, err) - assert.NotEqual(t, rr.Status.LastMaintenanceTime, lastTm) - assert.NotEqual(t, rr.Status.RecentMaintenance, history) + now := time.Now().Round(time.Second) - rr.Status.LastMaintenanceTime = &metav1.Time{Time: time.Now()} - lastTm = rr.Status.LastMaintenanceTime - history = rr.Status.RecentMaintenance - err = reconciler.runMaintenanceIfDue(context.TODO(), rr, reconciler.logger) - assert.NoError(t, err) - assert.Equal(t, rr.Status.LastMaintenanceTime, lastTm) - assert.Equal(t, rr.Status.RecentMaintenance, history) + tests := []struct { + name string + repo *velerov1api.BackupRepository + startJobFunc func(client.Client, context.Context, *velerov1api.BackupRepository, string, kube.PodResources, logrus.Level, *logging.FormatFlag, logrus.FieldLogger) (string, error) + waitJobFunc func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) + expectedMaintenanceTime time.Time + expectedHistory []velerov1api.BackupRepositoryMaintenanceStatus + expectedErr string + }{ + { + name: "not due", + repo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + MaintenanceFrequency: metav1.Duration{Duration: time.Hour}, + }, + Status: velerov1api.BackupRepositoryStatus{ + LastMaintenanceTime: &metav1.Time{Time: now}, + RecentMaintenance: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + }, + expectedMaintenanceTime: now, + expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + CompleteTimestamp: 
&metav1.Time{Time: now}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + { + name: "start failed", + repo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + MaintenanceFrequency: metav1.Duration{Duration: time.Hour}, + }, + Status: velerov1api.BackupRepositoryStatus{ + LastMaintenanceTime: &metav1.Time{Time: now.Add(-time.Hour - time.Minute)}, + RecentMaintenance: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + }, + startJobFunc: startMaintenanceJobFail, + expectedMaintenanceTime: now.Add(-time.Hour - time.Minute), + expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + { + StartTimestamp: &metav1.Time{Time: now}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "Failed to start maintenance job, err: fake-start-error", + }, + }, + }, + { + name: "wait failed", + repo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + MaintenanceFrequency: metav1.Duration{Duration: time.Hour}, + }, + Status: velerov1api.BackupRepositoryStatus{ + LastMaintenanceTime: &metav1.Time{Time: now.Add(-time.Hour - time.Minute)}, + RecentMaintenance: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + }, + startJobFunc: startMaintenanceJobSucceed, + waitJobFunc: waitMaintenanceJobCompleteFail, + expectedErr: "error waiting repo maintenance completion status: fake-wait-error", + expectedMaintenanceTime: now.Add(-time.Hour - time.Minute), + expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + { + name: "maintenance failed", + repo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + MaintenanceFrequency: metav1.Duration{Duration: time.Hour}, + }, + Status: velerov1api.BackupRepositoryStatus{ + LastMaintenanceTime: &metav1.Time{Time: now.Add(-time.Hour - time.Minute)}, + RecentMaintenance: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + }, + startJobFunc: startMaintenanceJobSucceed, + waitJobFunc: waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceFailed, "fake-maintenance-message"), + expectedMaintenanceTime: now.Add(-time.Hour - time.Minute), + expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: 
now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceFailed, + Message: "fake-maintenance-message", + }, + }, + }, + { + name: "maintenance succeeded", + repo: &velerov1api.BackupRepository{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "repo", + }, + Spec: velerov1api.BackupRepositorySpec{ + MaintenanceFrequency: metav1.Duration{Duration: time.Hour}, + }, + Status: velerov1api.BackupRepositoryStatus{ + LastMaintenanceTime: &metav1.Time{Time: now.Add(-time.Hour - time.Minute)}, + RecentMaintenance: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + }, + startJobFunc: startMaintenanceJobSucceed, + waitJobFunc: waitMaintenanceJobCompleteFunc(now, velerov1api.BackupRepositoryMaintenanceSucceeded, ""), + expectedMaintenanceTime: now.Add(time.Hour), + expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(-time.Hour * 2)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(-time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + { + StartTimestamp: &metav1.Time{Time: now}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + reconciler := mockBackupRepoReconciler(t, "", test.repo, nil) + reconciler.clock = &fakeClock{now} + err := reconciler.Client.Create(context.TODO(), test.repo) + assert.NoError(t, err) + + funcStartMaintenanceJob = test.startJobFunc + funcWaitMaintenanceJobComplete = test.waitJobFunc + + err = reconciler.runMaintenanceIfDue(context.TODO(), test.repo, velerotest.NewLogger()) + if test.expectedErr == "" { + assert.NoError(t, err) + } + + assert.Equal(t, test.expectedMaintenanceTime, test.repo.Status.LastMaintenanceTime.Time) + assert.Len(t, test.repo.Status.RecentMaintenance, len(test.expectedHistory)) + + for i := 0; i < len(test.expectedHistory); i++ { + assert.Equal(t, test.expectedHistory[i].StartTimestamp.Time, test.repo.Status.RecentMaintenance[i].StartTimestamp.Time) + if test.expectedHistory[i].CompleteTimestamp == nil { + assert.Nil(t, test.repo.Status.RecentMaintenance[i].CompleteTimestamp) + } else { + assert.Equal(t, test.expectedHistory[i].CompleteTimestamp.Time, test.repo.Status.RecentMaintenance[i].CompleteTimestamp.Time) + } + + assert.Equal(t, test.expectedHistory[i].Result, test.repo.Status.RecentMaintenance[i].Result) + assert.Equal(t, test.expectedHistory[i].Message, test.repo.Status.RecentMaintenance[i].Message) + } + }) + } } func TestInitializeRepo(t *testing.T) { @@ -742,7 +950,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) { } func TestRecallMaintenance(t *testing.T) { - now := time.Now() + now := time.Now().Round(time.Second) schemeFail := runtime.NewScheme() velerov1api.AddToScheme(schemeFail) @@ -756,7 +964,7 @@ func TestRecallMaintenance(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "job1", Namespace: velerov1api.DefaultNamespace, - Labels: map[string]string{repository.RepositoryNameLabel: "repo"}, + 
Labels: map[string]string{maintenance.RepositoryNameLabel: "repo"}, CreationTimestamp: metav1.Time{Time: now.Add(time.Hour)}, }, Status: batchv1.JobStatus{ @@ -766,9 +974,9 @@ func TestRecallMaintenance(t *testing.T) { }, } - jobPodSucceeded := builder.ForPod(velerov1api.DefaultNamespace, "job1").Labels(map[string]string{"job-name": "job1"}).ContainerStatuses(&v1.ContainerStatus{ - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{}, + jobPodSucceeded := builder.ForPod(velerov1api.DefaultNamespace, "job1").Labels(map[string]string{"job-name": "job1"}).ContainerStatuses(&corev1.ContainerStatus{ + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}, }, }).Result() @@ -777,8 +985,8 @@ func TestRecallMaintenance(t *testing.T) { kubeClientObj []runtime.Object runtimeScheme *runtime.Scheme repoLastMatainTime metav1.Time - expectNewHistory bool - expectTimeUpdate bool + expectNewHistory []velerov1api.BackupRepositoryMaintenanceStatus + expectTimeUpdate *metav1.Time expectedErr string }{ { @@ -798,7 +1006,13 @@ func TestRecallMaintenance(t *testing.T) { jobPodSucceeded, }, repoLastMatainTime: metav1.Time{Time: now.Add(time.Hour * 5)}, - expectNewHistory: true, + expectNewHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, }, { name: "update last time", @@ -807,8 +1021,14 @@ func TestRecallMaintenance(t *testing.T) { jobSucceeded, jobPodSucceeded, }, - expectNewHistory: true, - expectTimeUpdate: true, + expectNewHistory: []velerov1api.BackupRepositoryMaintenanceStatus{ + { + StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)}, + CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)}, + Result: velerov1api.BackupRepositoryMaintenanceSucceeded, + }, + }, + expectTimeUpdate: &metav1.Time{Time: now.Add(time.Hour * 2)}, }, } @@ -821,13 +1041,12 @@ func TestRecallMaintenance(t *testing.T) { test.kubeClientObj = append(test.kubeClientObj, backupRepo) - fakeClientBuilder := fake.NewClientBuilder() + fakeClientBuilder := clientFake.NewClientBuilder() fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme) fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() r.Client = fakeClient lastTm := backupRepo.Status.LastMaintenanceTime - history := backupRepo.Status.RecentMaintenance err := r.recallMaintenance(context.TODO(), backupRepo, velerotest.NewLogger()) if test.expectedErr != "" { @@ -835,12 +1054,22 @@ func TestRecallMaintenance(t *testing.T) { } else { assert.NoError(t, err) - if test.expectNewHistory { - assert.NotEqual(t, history, backupRepo.Status.RecentMaintenance) + if test.expectNewHistory == nil { + assert.Nil(t, backupRepo.Status.RecentMaintenance) + } else { + assert.Len(t, backupRepo.Status.RecentMaintenance, len(test.expectNewHistory)) + for i := 0; i < len(test.expectNewHistory); i++ { + assert.Equal(t, test.expectNewHistory[i].StartTimestamp.Time, backupRepo.Status.RecentMaintenance[i].StartTimestamp.Time) + assert.Equal(t, test.expectNewHistory[i].CompleteTimestamp.Time, backupRepo.Status.RecentMaintenance[i].CompleteTimestamp.Time) + assert.Equal(t, test.expectNewHistory[i].Result, backupRepo.Status.RecentMaintenance[i].Result) + assert.Equal(t, test.expectNewHistory[i].Message, backupRepo.Status.RecentMaintenance[i].Message) + } } - if test.expectTimeUpdate { - assert.NotEqual(t, lastTm, 
backupRepo.Status.LastMaintenanceTime) + if test.expectTimeUpdate != nil { + assert.Equal(t, test.expectTimeUpdate.Time, backupRepo.Status.LastMaintenanceTime.Time) + } else { + assert.Equal(t, lastTm, backupRepo.Status.LastMaintenanceTime) } } }) diff --git a/pkg/repository/maintenance.go b/pkg/repository/maintenance/maintenance.go similarity index 88% rename from pkg/repository/maintenance.go rename to pkg/repository/maintenance/maintenance.go index c1b4276a57..807a4113c7 100644 --- a/pkg/repository/maintenance.go +++ b/pkg/repository/maintenance/maintenance.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package repository +package maintenance import ( "context" @@ -67,8 +67,8 @@ func GenerateJobName(repo string) string { return jobName } -// DeleteOldMaintenanceJobs deletes old maintenance jobs and keeps the latest N jobs -func DeleteOldMaintenanceJobs(cli client.Client, repo string, keep int) error { +// DeleteOldJobs deletes old maintenance jobs and keeps the latest N jobs +func DeleteOldJobs(cli client.Client, repo string, keep int) error { // Get the maintenance job list by label jobList := &batchv1.JobList{} err := cli.List(context.TODO(), jobList, client.MatchingLabels(map[string]string{RepositoryNameLabel: repo})) @@ -119,7 +119,7 @@ func waitForJobComplete(ctx context.Context, client client.Client, ns string, jo return ret, err } -func getMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, error) { +func getResultFromJob(cli client.Client, job *batchv1.Job) (string, error) { // Get the maintenance job related pod by label selector podList := &v1.PodList{} err := cli.List(context.TODO(), podList, client.InNamespace(job.Namespace), client.MatchingLabels(map[string]string{"job-name": job.Name})) @@ -148,7 +148,7 @@ func getMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, e return terminated.Message, nil } -// getMaintenanceJobConfig is called to get the Maintenance Job Config for the +// getJobConfig is called to get the Maintenance Job Config for the // BackupRepository specified by the repo parameter. // // Params: @@ -159,7 +159,7 @@ func getMaintenanceResultFromJob(cli client.Client, job *batchv1.Job) (string, e // veleroNamespace: the Velero-installed namespace. It's used to retrieve the BackupRepository. // repoMaintenanceJobConfig: the repository maintenance job ConfigMap name. // repo: the BackupRepository needs to run the maintenance Job. 
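As the parameter list above suggests, the lookup amounts to reading one ConfigMap and layering a repo-specific JSON entry over a shared default. A rough sketch under those assumptions; the trimmed config struct and the per-repo/"global" key layout are assumptions for illustration, not verified against the full implementation:

    import (
    	"context"
    	"encoding/json"

    	corev1 "k8s.io/api/core/v1"
    	apierrors "k8s.io/apimachinery/pkg/api/errors"
    	"k8s.io/apimachinery/pkg/types"
    	"sigs.k8s.io/controller-runtime/pkg/client"
    )

    // jobConfigSketch carries only a couple of fields for illustration.
    type jobConfigSketch struct {
    	CPURequest string `json:"cpuRequest,omitempty"`
    	MemRequest string `json:"memRequest,omitempty"`
    }

    // loadJobConfig returns a nil config (not an error) when the ConfigMap is
    // absent, so callers fall back to defaults; the repo-specific entry is
    // unmarshaled last and therefore overrides the shared entry field by field.
    func loadJobConfig(ctx context.Context, cli client.Client, ns, cmName, repoKey string) (*jobConfigSketch, error) {
    	cm := &corev1.ConfigMap{}
    	if err := cli.Get(ctx, types.NamespacedName{Namespace: ns, Name: cmName}, cm); err != nil {
    		if apierrors.IsNotFound(err) {
    			return nil, nil
    		}
    		return nil, err
    	}

    	result := &jobConfigSketch{}
    	for _, key := range []string{"global", repoKey} {
    		raw, ok := cm.Data[key]
    		if !ok {
    			continue
    		}
    		if err := json.Unmarshal([]byte(raw), result); err != nil {
    			return nil, err
    		}
    	}
    	return result, nil
    }
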
-func getMaintenanceJobConfig( +func getJobConfig( ctx context.Context, client client.Client, logger logrus.FieldLogger, @@ -241,8 +241,8 @@ func getMaintenanceJobConfig( return result, nil } -// WaitMaintenanceJobComplete waits the completion of the specified maintenance job and return the BackupRepositoryMaintenanceStatus -func WaitMaintenanceJobComplete(cli client.Client, ctx context.Context, jobName, ns string, logger logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { +// WaitJobComplete waits the completion of the specified maintenance job and return the BackupRepositoryMaintenanceStatus +func WaitJobComplete(cli client.Client, ctx context.Context, jobName, ns string, logger logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) { log := logger.WithField("job name", jobName) maintenanceJob, err := waitForJobComplete(ctx, cli, ns, jobName) @@ -252,17 +252,17 @@ func WaitMaintenanceJobComplete(cli client.Client, ctx context.Context, jobName, log.Info("Maintenance repo complete") - result, err := getMaintenanceResultFromJob(cli, maintenanceJob) + result, err := getResultFromJob(cli, maintenanceJob) if err != nil { log.WithError(err).Warn("Failed to get maintenance job result") } - return composeMaintenanceStatusFromJob(maintenanceJob, result), nil + return composeStatusFromJob(maintenanceJob, result), nil } -// WaitAllMaintenanceJobComplete checks all the incomplete maintenance jobs of the specified repo and wait for them to complete, +// WaitAllJobsComplete checks all the incomplete maintenance jobs of the specified repo and wait for them to complete, // and then return the maintenance jobs' status in the range of limit -func WaitAllMaintenanceJobComplete(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) { +func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1api.BackupRepository, limit int, log logrus.FieldLogger) ([]velerov1api.BackupRepositoryMaintenanceStatus, error) { jobList := &batchv1.JobList{} err := cli.List(context.TODO(), jobList, &client.ListOptions{ Namespace: repo.Namespace, @@ -303,19 +303,19 @@ func WaitAllMaintenanceJobComplete(ctx context.Context, cli client.Client, repo job = updated } - message, err := getMaintenanceResultFromJob(cli, job) + message, err := getResultFromJob(cli, job) if err != nil { return nil, errors.Wrapf(err, "error getting maintenance job[%s] result", job.Name) } - history = append(history, composeMaintenanceStatusFromJob(job, message)) + history = append(history, composeStatusFromJob(job, message)) } return history, nil } -// StartMaintenanceJob creates a new maintenance job -func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, repoMaintenanceJobConfig string, +// StartNewJob creates a new maintenance job +func StartNewJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, repoMaintenanceJobConfig string, podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag, logger logrus.FieldLogger) (string, error) { bsl := &velerov1api.BackupStorageLocation{} if err := cli.Get(ctx, client.ObjectKey{Namespace: repo.Namespace, Name: repo.Spec.BackupStorageLocation}, bsl); err != nil { @@ -329,7 +329,7 @@ func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1a "repo UID": repo.UID, }) - jobConfig, err := getMaintenanceJobConfig( + jobConfig, err 
:= getJobConfig( ctx, cli, log, @@ -346,7 +346,7 @@ func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1a log.Info("Starting maintenance repo") - maintenanceJob, err := buildMaintenanceJob(cli, ctx, repo, bsl.Name, jobConfig, podResources, logLevel, logFormat) + maintenanceJob, err := buildJob(cli, ctx, repo, bsl.Name, jobConfig, podResources, logLevel, logFormat) if err != nil { return "", errors.Wrap(err, "error to build maintenance job") } @@ -362,7 +362,7 @@ func StartMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1a return maintenanceJob.Name, nil } -func buildMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, bslName string, config *JobConfigs, +func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, bslName string, config *JobConfigs, podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag) (*batchv1.Job, error) { // Get the Velero server deployment deployment := &appsv1.Deployment{} @@ -479,7 +479,7 @@ func buildMaintenanceJob(cli client.Client, ctx context.Context, repo *velerov1a return job, nil } -func composeMaintenanceStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRepositoryMaintenanceStatus { +func composeStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRepositoryMaintenanceStatus { result := velerov1api.BackupRepositoryMaintenanceSucceeded if job.Status.Failed > 0 { result = velerov1api.BackupRepositoryMaintenanceFailed diff --git a/pkg/repository/maintenance_test.go b/pkg/repository/maintenance/maintenance_test.go similarity index 97% rename from pkg/repository/maintenance_test.go rename to pkg/repository/maintenance/maintenance_test.go index 6d04a7837a..1854aa0f9f 100644 --- a/pkg/repository/maintenance_test.go +++ b/pkg/repository/maintenance/maintenance_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package repository +package maintenance import ( "context" @@ -76,7 +76,7 @@ func TestGenerateJobName1(t *testing.T) { }) } } -func TestDeleteOldMaintenanceJobs(t *testing.T) { +func TestDeleteOldJobs(t *testing.T) { // Set up test repo and keep value repo := "test-repo" keep := 2 @@ -114,7 +114,7 @@ func TestDeleteOldMaintenanceJobs(t *testing.T) { cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() // Call the function - err := DeleteOldMaintenanceJobs(cli, repo, keep) + err := DeleteOldJobs(cli, repo, keep) assert.NoError(t, err) // Get the remaining jobs @@ -208,7 +208,7 @@ func TestWaitForJobComplete(t *testing.T) { } } -func TestGetMaintenanceResultFromJob(t *testing.T) { +func TestGetResultFromJob(t *testing.T) { // Set up test job job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ @@ -230,7 +230,7 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { cli := fake.NewClientBuilder().WithObjects(job, pod).Build() // test an error should be returned - result, err := getMaintenanceResultFromJob(cli, job) + result, err := getResultFromJob(cli, job) assert.Error(t, err) assert.Equal(t, "", result) @@ -245,7 +245,7 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { // Test an error should be returned cli = fake.NewClientBuilder().WithObjects(job, pod).Build() - result, err = getMaintenanceResultFromJob(cli, job) + result, err = getResultFromJob(cli, job) assert.Error(t, err) assert.Equal(t, "", result) @@ -264,12 +264,12 @@ func TestGetMaintenanceResultFromJob(t *testing.T) { // This call should return the termination message with no error cli = fake.NewClientBuilder().WithObjects(job, pod).Build() - result, err = getMaintenanceResultFromJob(cli, job) + result, err = getResultFromJob(cli, job) assert.NoError(t, err) assert.Equal(t, "test message", result) } -func TestGetMaintenanceJobConfig(t *testing.T) { +func TestGetJobConfig(t *testing.T) { ctx := context.Background() logger := logrus.New() veleroNamespace := "velero" @@ -425,7 +425,7 @@ func TestGetMaintenanceJobConfig(t *testing.T) { fakeClient = velerotest.NewFakeControllerRuntimeClient(t) } - jobConfig, err := getMaintenanceJobConfig( + jobConfig, err := getJobConfig( ctx, fakeClient, logger, @@ -444,7 +444,7 @@ func TestGetMaintenanceJobConfig(t *testing.T) { } } -func TestWaitAllMaintenanceJobComplete(t *testing.T) { +func TestWaitAlJobsComplete(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) veleroNamespace := "velero" @@ -712,7 +712,7 @@ func TestWaitAllMaintenanceJobComplete(t *testing.T) { fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build() - history, err := WaitAllMaintenanceJobComplete(test.ctx, fakeClient, repo, 3, velerotest.NewLogger()) + history, err := WaitAllJobsComplete(test.ctx, fakeClient, repo, 3, velerotest.NewLogger()) if test.expectedError != "" { assert.EqualError(t, err, test.expectedError) @@ -733,7 +733,7 @@ func TestWaitAllMaintenanceJobComplete(t *testing.T) { cancel() } -func TestBuildMaintenanceJob(t *testing.T) { +func TestBuildJob(t *testing.T) { testCases := []struct { name string m *JobConfigs @@ -872,7 +872,7 @@ func TestBuildMaintenanceJob(t *testing.T) { cli := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(objs...).Build() // Call the function to test - job, err := buildMaintenanceJob(cli, context.TODO(), param.BackupRepo, param.BackupLocation.Name, tc.m, *tc.m.PodResources, tc.logLevel, tc.logFormat) + job, err := buildJob(cli, context.TODO(), param.BackupRepo, 
param.BackupLocation.Name, tc.m, *tc.m.PodResources, tc.logLevel, tc.logFormat) // Check the error if tc.expectedError { From 3900f2f1173a1b90e863d10b926e6edf49f8c481 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Wed, 15 Jan 2025 14:53:57 +0800 Subject: [PATCH 7/9] recall repo maintenance history on restart Signed-off-by: Lyndon-Li --- pkg/controller/backup_repository_controller.go | 8 ++++---- pkg/repository/maintenance/maintenance.go | 2 +- pkg/repository/maintenance/maintenance_test.go | 7 +------ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pkg/controller/backup_repository_controller.go b/pkg/controller/backup_repository_controller.go index f5f49297f9..fe59529d48 100644 --- a/pkg/controller/backup_repository_controller.go +++ b/pkg/controller/backup_repository_controller.go @@ -65,13 +65,13 @@ type BackupRepoReconciler struct { repositoryManager repomanager.Manager keepLatestMaintenanceJobs int repoMaintenanceConfig string - podResources kube.PodResources + maintenanceJobResources kube.PodResources logLevel logrus.Level logFormat *logging.FormatFlag } func NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client client.Client, repositoryManager repomanager.Manager, - maintenanceFrequency time.Duration, backupRepoConfig string, keepLatestMaintenanceJobs int, repoMaintenanceConfig string, podResources kube.PodResources, + maintenanceFrequency time.Duration, backupRepoConfig string, keepLatestMaintenanceJobs int, repoMaintenanceConfig string, maintenanceJobResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag) *BackupRepoReconciler { c := &BackupRepoReconciler{ client, @@ -83,7 +83,7 @@ func NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client repositoryManager, keepLatestMaintenanceJobs, repoMaintenanceConfig, - podResources, + maintenanceJobResources, logLevel, logFormat, } @@ -427,7 +427,7 @@ func (r *BackupRepoReconciler) runMaintenanceIfDue(ctx context.Context, req *vel log.Info("Running maintenance on backup repository") - job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.podResources, r.logLevel, r.logFormat, log) + job, err := funcStartMaintenanceJob(r.Client, ctx, req, r.repoMaintenanceConfig, r.maintenanceJobResources, r.logLevel, r.logFormat, log) if err != nil { log.WithError(err).Warn("Starting repo maintenance failed") return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) { diff --git a/pkg/repository/maintenance/maintenance.go b/pkg/repository/maintenance/maintenance.go index 807a4113c7..b854046ba0 100644 --- a/pkg/repository/maintenance/maintenance.go +++ b/pkg/repository/maintenance/maintenance.go @@ -284,7 +284,7 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a history := []velerov1api.BackupRepositoryMaintenanceStatus{} - startPos := len(history) - limit + startPos := len(jobList.Items) - limit if startPos < 0 { startPos = 0 } diff --git a/pkg/repository/maintenance/maintenance_test.go b/pkg/repository/maintenance/maintenance_test.go index 1854aa0f9f..e4335a61fe 100644 --- a/pkg/repository/maintenance/maintenance_test.go +++ b/pkg/repository/maintenance/maintenance_test.go @@ -444,7 +444,7 @@ func TestGetJobConfig(t *testing.T) { } } -func TestWaitAlJobsComplete(t *testing.T) { +func TestWaitAllJobsComplete(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) veleroNamespace := "velero" @@ -680,11 +680,6 @@ func TestWaitAlJobsComplete(t *testing.T) { 
jobPodSucceeded3,
 			},
 			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
-				{
-					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
-					StartTimestamp:    &metav1.Time{Time: now},
-					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
-				},
 				{
 					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
 					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour)},

From 0045e940723dc4312feb92fd37190f166829693b Mon Sep 17 00:00:00 2001
From: Lyndon-Li
Date: Wed, 15 Jan 2025 16:18:14 +0800
Subject: [PATCH 8/9] get maintenance result only for failed jobs

Signed-off-by: Lyndon-Li
---
 pkg/repository/maintenance/maintenance.go     | 24 ++++++++++++-----
 .../maintenance/maintenance_test.go           | 26 +++++++++++++++++--
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/pkg/repository/maintenance/maintenance.go b/pkg/repository/maintenance/maintenance.go
index b854046ba0..a01a2c464d 100644
--- a/pkg/repository/maintenance/maintenance.go
+++ b/pkg/repository/maintenance/maintenance.go
@@ -250,11 +250,16 @@ func WaitJobComplete(cli client.Client, ctx context.Context, jobName, ns string,
 		return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to wait for maintenance job complete")
 	}
 
-	log.Info("Maintenance repo complete")
+	log.Infof("Maintenance repo complete, succeeded %v, failed %v", maintenanceJob.Status.Succeeded, maintenanceJob.Status.Failed)
 
-	result, err := getResultFromJob(cli, maintenanceJob)
-	if err != nil {
-		log.WithError(err).Warn("Failed to get maintenance job result")
+	result := ""
+	if maintenanceJob.Status.Failed > 0 {
+		if r, err := getResultFromJob(cli, maintenanceJob); err != nil {
+			log.WithError(err).Warn("Failed to get maintenance job result")
+			result = "Repo maintenance failed but result is not retrievable"
+		} else {
+			result = r
+		}
 	}
 
 	return composeStatusFromJob(maintenanceJob, result), nil
@@ -303,9 +308,14 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a
 			job = updated
 		}
 
-		message, err := getResultFromJob(cli, job)
-		if err != nil {
-			return nil, errors.Wrapf(err, "error getting maintenance job[%s] result", job.Name)
+		message := ""
+		if job.Status.Failed > 0 {
+			if msg, err := getResultFromJob(cli, job); err != nil {
+				log.WithError(err).Warnf("Failed to get result of maintenance job %s", job.Name)
+				message = "Repo maintenance failed but result is not retrievable"
+			} else {
+				message = msg
+			}
 		}
 
 		history = append(history, composeStatusFromJob(job, message))
diff --git a/pkg/repository/maintenance/maintenance_test.go b/pkg/repository/maintenance/maintenance_test.go
index e4335a61fe..7ac65d98e9 100644
--- a/pkg/repository/maintenance/maintenance_test.go
+++ b/pkg/repository/maintenance/maintenance_test.go
@@ -602,13 +602,35 @@ func TestWaitAllJobsComplete(t *testing.T) {
 			expectedError: "error waiting maintenance job[job1] complete: context deadline exceeded",
 		},
 		{
-			name:          "get result error",
+			name:          "get result error on succeeded job",
 			ctx:           context.TODO(),
 			runtimeScheme: scheme,
 			kubeClientObj: []runtime.Object{
 				jobSucceeded1,
 			},
-			expectedError: "error getting maintenance job[job1] result: no pod found for job job1",
+			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceSucceeded,
+					StartTimestamp:    &metav1.Time{Time: now},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
+				},
+			},
+		},
+		{
+			name:          "get result error on failed job",
+			ctx:           context.TODO(),
+			runtimeScheme: scheme,
+			kubeClientObj: []runtime.Object{
+				jobFailed1,
+			},
+			expectedStatus: []velerov1api.BackupRepositoryMaintenanceStatus{
+				{
+					Result:            velerov1api.BackupRepositoryMaintenanceFailed,
+					StartTimestamp:    &metav1.Time{Time: now.Add(time.Hour)},
+					CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
+					Message:           "Repo maintenance failed but result is not retrievable",
+				},
+			},
 		},
 		{
 			name: "less than limit",

From 91fcb6511803443388de8e177d211be9445ebaa3 Mon Sep 17 00:00:00 2001
From: Lyndon-Li
Date: Thu, 16 Jan 2025 13:38:51 +0800
Subject: [PATCH 9/9] add maintenance wait backoff log

Signed-off-by: Lyndon-Li
---
 pkg/repository/maintenance/maintenance.go     | 25 +++++++-
 .../maintenance/maintenance_test.go           | 61 ++++++++++++++++++-
 pkg/test/test_logger.go                       | 16 +++++
 3 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/pkg/repository/maintenance/maintenance.go b/pkg/repository/maintenance/maintenance.go
index a01a2c464d..9694dd4dfb 100644
--- a/pkg/repository/maintenance/maintenance.go
+++ b/pkg/repository/maintenance/maintenance.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"sort"
 	"time"
@@ -92,10 +93,22 @@ func DeleteOldJobs(cli client.Client, repo string, keep int) error {
 	return nil
 }
 
+var waitCompletionBackOff = wait.Backoff{
+	Duration: time.Minute * 20,
+	Steps:    math.MaxInt,
+	Factor:   2,
+	Cap:      time.Hour * 12,
+}
+
 // waitForJobComplete waits for completion of the specified job and updates the latest job object
-func waitForJobComplete(ctx context.Context, client client.Client, ns string, job string) (*batchv1.Job, error) {
+func waitForJobComplete(ctx context.Context, client client.Client, ns string, job string, logger logrus.FieldLogger) (*batchv1.Job, error) {
 	var ret *batchv1.Job
 
+	backOff := waitCompletionBackOff
+
+	startTime := time.Now()
+	nextCheckpoint := startTime.Add(backOff.Step())
+
 	err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
 		updated := &batchv1.Job{}
 		err := client.Get(ctx, types.NamespacedName{Namespace: ns, Name: job}, updated)
@@ -113,6 +126,12 @@ func waitForJobComplete(ctx context.Context, client client.Client, ns string, jo
 			return true, nil
 		}
 
+		now := time.Now()
+		if now.After(nextCheckpoint) {
+			logger.Warnf("Repo maintenance job %s has lasted %v minutes", job, now.Sub(startTime).Minutes())
+			nextCheckpoint = now.Add(backOff.Step())
+		}
+
 		return false, nil
 	})
 
@@ -245,7 +264,7 @@ func getJobConfig(
 func WaitJobComplete(cli client.Client, ctx context.Context, jobName, ns string, logger logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
 	log := logger.WithField("job name", jobName)
 
-	maintenanceJob, err := waitForJobComplete(ctx, cli, ns, jobName)
+	maintenanceJob, err := waitForJobComplete(ctx, cli, ns, jobName, logger)
 	if err != nil {
 		return velerov1api.BackupRepositoryMaintenanceStatus{}, errors.Wrap(err, "error to wait for maintenance job complete")
 	}
@@ -300,7 +319,7 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a
 		if job.Status.Succeeded == 0 && job.Status.Failed == 0 {
 			log.Infof("Waiting for maintenance job %s to complete", job.Name)
 
-			updated, err := waitForJobComplete(ctx, cli, job.Namespace, job.Name)
+			updated, err := waitForJobComplete(ctx, cli, job.Namespace, job.Name, log)
 			if err != nil {
 				return nil, errors.Wrapf(err, "error waiting maintenance job[%s] complete", job.Name)
 			}
diff --git a/pkg/repository/maintenance/maintenance_test.go b/pkg/repository/maintenance/maintenance_test.go
index 7ac65d98e9..bfe700ce86 100644
---
a/pkg/repository/maintenance/maintenance_test.go +++ b/pkg/repository/maintenance/maintenance_test.go @@ -19,6 +19,7 @@ package maintenance import ( "context" "fmt" + "math" "strings" "testing" "time" @@ -31,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -148,13 +150,30 @@ func TestWaitForJobComplete(t *testing.T) { scheme := runtime.NewScheme() batchv1.AddToScheme(scheme) + waitCompletionBackOff1 := wait.Backoff{ + Duration: time.Second, + Steps: math.MaxInt, + Factor: 2, + Cap: time.Second * 12, + } + + waitCompletionBackOff2 := wait.Backoff{ + Duration: time.Second, + Steps: math.MaxInt, + Factor: 2, + Cap: time.Second * 2, + } + // Define test cases tests := []struct { description string // Test case description kubeClientObj []runtime.Object runtimeScheme *runtime.Scheme jobStatus batchv1.JobStatus // Job status to set for the test - expectError bool // Whether an error is expected + logBackOff wait.Backoff + updateAfter time.Duration + expectedLogs int + expectError bool // Whether an error is expected }{ { description: "wait error", @@ -183,6 +202,26 @@ func TestWaitForJobComplete(t *testing.T) { }, expectError: false, }, + { + description: "Log backoff not to cap", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + job, + }, + logBackOff: waitCompletionBackOff1, + updateAfter: time.Second * 8, + expectedLogs: 3, + }, + { + description: "Log backoff to cap", + runtimeScheme: scheme, + kubeClientObj: []runtime.Object{ + job, + }, + logBackOff: waitCompletionBackOff2, + updateAfter: time.Second * 6, + expectedLogs: 3, + }, } // Run tests @@ -195,8 +234,24 @@ func TestWaitForJobComplete(t *testing.T) { fakeClientBuilder = fakeClientBuilder.WithScheme(tc.runtimeScheme) fakeClient := fakeClientBuilder.WithRuntimeObjects(tc.kubeClientObj...).Build() + buffer := []string{} + logger := velerotest.NewMultipleLogger(&buffer) + + waitCompletionBackOff = tc.logBackOff + + if tc.updateAfter != 0 { + go func() { + time.Sleep(tc.updateAfter) + + original := job.DeepCopy() + job.Status.Succeeded = 1 + err := fakeClient.Status().Patch(context.Background(), job, client.MergeFrom(original)) + require.NoError(t, err) + }() + } + // Call the function - _, err := waitForJobComplete(context.Background(), fakeClient, job.Namespace, job.Name) + _, err := waitForJobComplete(context.Background(), fakeClient, job.Namespace, job.Name, logger) // Check if the error matches the expectation if tc.expectError { @@ -204,6 +259,8 @@ func TestWaitForJobComplete(t *testing.T) { } else { assert.NoError(t, err) } + + assert.LessOrEqual(t, len(buffer), tc.expectedLogs) }) } } diff --git a/pkg/test/test_logger.go b/pkg/test/test_logger.go index 65dc8422a7..6a3f3b9196 100644 --- a/pkg/test/test_logger.go +++ b/pkg/test/test_logger.go @@ -62,3 +62,19 @@ func NewSingleLoggerWithHooks(buffer *string, hooks []logrus.Hook) logrus.FieldL return logrus.NewEntry(logger) } + +type multipleLogRecorder struct { + buffer *[]string +} + +func (m *multipleLogRecorder) Write(p []byte) (n int, err error) { + *m.buffer = append(*m.buffer, string(p[:])) + return len(p), nil +} + +func NewMultipleLogger(buffer *[]string) logrus.FieldLogger { + logger := logrus.New() + logger.Out = &multipleLogRecorder{buffer} + logger.Level = logrus.TraceLevel + return logrus.NewEntry(logger) +}
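
A note on the backoff logging introduced in PATCH 9: wait.Backoff.Step() returns the current delay and then advances the sequence, so every emitted warning pushes the next checkpoint further out (20m, 40m, 80m, ... capped at 12h), keeping the log quiet while a maintenance job runs for hours. Below is a minimal, self-contained sketch of the same pattern, scaled down to seconds so it can be run directly; the durations, poll interval, and message text are illustrative stand-ins, not Velero's values.

package main

import (
	"fmt"
	"math"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Same shape as waitCompletionBackOff, scaled down: warnings fire after
	// 1s, then 2s, 4s, ... with the gap between warnings capped at 5s.
	backOff := wait.Backoff{
		Duration: time.Second,
		Steps:    math.MaxInt,
		Factor:   2,
		Cap:      5 * time.Second,
	}

	startTime := time.Now()
	nextCheckpoint := startTime.Add(backOff.Step())

	// Stand-in for the PollUntilContextCancel loop: pretend the job stays
	// incomplete for ten seconds, polling every 200ms.
	for time.Since(startTime) < 10*time.Second {
		time.Sleep(200 * time.Millisecond)

		if now := time.Now(); now.After(nextCheckpoint) {
			fmt.Printf("job still running after %v\n", now.Sub(startTime).Round(time.Second))
			nextCheckpoint = now.Add(backOff.Step()) // widen the gap before the next warning
		}
	}
}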
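
The NewMultipleLogger helper added to pkg/test/test_logger.go works because logrus writes each formatted entry through logger.Out in a single Write call, so an io.Writer that appends every call to a slice gives the test an exact per-entry count to assert against. A stripped-down standalone version of the same idea follows; the names (sliceWriter, etc.) are illustrative and not part of the Velero test package.

package main

import (
	"fmt"

	"github.com/sirupsen/logrus"
)

// sliceWriter receives one Write call per formatted log entry and records it.
type sliceWriter struct {
	buffer *[]string
}

func (s *sliceWriter) Write(p []byte) (int, error) {
	*s.buffer = append(*s.buffer, string(p))
	return len(p), nil
}

func main() {
	buffer := []string{}

	logger := logrus.New()
	logger.Out = &sliceWriter{&buffer}
	logger.Level = logrus.TraceLevel

	logger.Warnf("Repo maintenance job %s has lasted %v minutes", "job1", 20.0)
	logger.Warnf("Repo maintenance job %s has lasted %v minutes", "job1", 60.0)

	fmt.Println(len(buffer)) // prints 2: one slice element per log entry
}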