From 9276f5a8668f2234974a386f014bfceba6e8875a Mon Sep 17 00:00:00 2001 From: Sourav Maji Date: Wed, 4 Sep 2024 10:52:23 -0700 Subject: [PATCH] [controller] Remove older backup versions after retention period (#1149) If there are old lingering version and there are newer repush version in regular cadence, today Venice will not delete the old version as the getLatestVersionPromoteToCurrentTimestamp updated and never fall off retention period. This PR fixed that by checking version creation time and delete if its older than retention period. --------- Co-authored-by: Sourav Maji --- .../StoreBackupVersionCleanupService.java | 21 ++++++------------- .../TestStoreBackupVersionCleanupService.java | 15 +++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java index 68380ff1d1..ad634459a7 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java @@ -265,28 +265,19 @@ protected boolean cleanupBackupVersion(Store store, String clusterName) { return false; } - // If the current version is from repush, do not delete the version before previous current version, - // instead delete the repush source version from which it was repushed as they hold identical data. - // During later iteration of this thread, it will delete the older backup after threshold retention time. + // This will delete backup versions which satisfy any of the following conditions + // 1. If the current version is from repush, delete the repush source backup version + // 2. Current version is from a repush, but still a lingering version older than retention period. + // 3. Current version is not repush and is older than retention, delete any versions < current version. int repushSourceVersion = store.getVersionOrThrow(currentVersion).getRepushSourceVersion(); List readyToBeRemovedVersions = versions.stream() .filter( v -> repushSourceVersion > NON_EXISTING_VERSION - ? v.getNumber() == repushSourceVersion + ? (v.getNumber() == repushSourceVersion || v.getNumber() < currentVersion + && v.getCreatedTime() + defaultBackupVersionRetentionMs < time.getMilliseconds()) : v.getNumber() < currentVersion) .collect(Collectors.toList()); - // If there are still leaking versions due to consecutive repushes with some version failing in other fabric, - // there could be versions with repushSourceVersion does not match current version, delete them after backup - // retention period. - if (readyToBeRemovedVersions.isEmpty()) { - for (Version version: versions) { - if (version.getNumber() < currentVersion && store.getLatestVersionPromoteToCurrentTimestamp() - + defaultBackupVersionRetentionMs < time.getMilliseconds()) { - readyToBeRemovedVersions.add(version); - } - } - } if (readyToBeRemovedVersions.isEmpty()) { return false; } diff --git a/services/venice-controller/src/test/java/com/linkedin/venice/controller/TestStoreBackupVersionCleanupService.java b/services/venice-controller/src/test/java/com/linkedin/venice/controller/TestStoreBackupVersionCleanupService.java index 04d8b530d0..8e453028a2 100644 --- a/services/venice-controller/src/test/java/com/linkedin/venice/controller/TestStoreBackupVersionCleanupService.java +++ b/services/venice-controller/src/test/java/com/linkedin/venice/controller/TestStoreBackupVersionCleanupService.java @@ -239,6 +239,21 @@ public void testCleanupBackupVersionRepush() { 1, TimeUnit.SECONDS, () -> verify(admin, atLeast(1)).deleteOldVersionInStore(clusterName, storeWithRollback.getName(), 2)); + + versions.clear(); + versions.put(1, VersionStatus.ONLINE); + versions.put(3, VersionStatus.ONLINE); + Store storeLingeringVersion = mockStore(-1, System.currentTimeMillis() - defaultRetentionMs * 2, versions, 3); + version = storeLingeringVersion.getVersion(2); + doReturn(2).when(version).getRepushSourceVersion(); + doReturn(1L).when(version).getCreatedTime(); + + // should delete version 1 as its lingerning and not a repush source version. + Assert.assertTrue(service.cleanupBackupVersion(storeLingeringVersion, clusterName)); + TestUtils.waitForNonDeterministicAssertion( + 1, + TimeUnit.SECONDS, + () -> verify(admin, atLeast(1)).deleteOldVersionInStore(clusterName, storeLingeringVersion.getName(), 1)); } @Test