diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp index b8a7aa26cb8..73bd5a8b793 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp @@ -3,6 +3,7 @@ #include #include +#include namespace NCloud::NBlockStore::NStorage { @@ -201,6 +202,17 @@ void TDiskRegistryActor::CompleteLoadState( ScheduleMakeBackup(ctx, args.LastBackupTime); ScheduleDiskRegistryAgentListExpiredParamsCleanup(ctx); + + if (auto orphanDevices = State->FindOrphanDevices()) { + LOG_INFO( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "Found devices without agent and try to remove them: " + "DeviceUUIDs=%s", + JoinSeq(" ", orphanDevices).c_str()); + + ExecuteTx(ctx, std::move(orphanDevices)); + } } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_remove_orphan_devices.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_remove_orphan_devices.cpp new file mode 100644 index 00000000000..ed9cb52a251 --- /dev/null +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_remove_orphan_devices.cpp @@ -0,0 +1,46 @@ +#include "disk_registry_actor.h" + + +namespace NCloud::NBlockStore::NStorage { + +using namespace NActors; + +using namespace NKikimr; +using namespace NKikimr::NTabletFlatExecutor; + +//////////////////////////////////////////////////////////////////////////////// + +bool TDiskRegistryActor::PrepareRemoveOrphanDevices( + const TActorContext& ctx, + TTransactionContext& tx, + TTxDiskRegistry::TRemoveOrphanDevices& args) +{ + Y_UNUSED(ctx); + Y_UNUSED(tx); + Y_UNUSED(args); + + return true; +} + +void TDiskRegistryActor::ExecuteRemoveOrphanDevices( + const TActorContext& ctx, + TTransactionContext& tx, + TTxDiskRegistry::TRemoveOrphanDevices& args) +{ + 
Y_UNUSED(ctx); + + TDiskRegistryDatabase db(tx.DB); + if (!args.OrphanDevices.empty()) { // forget orphans only when the list is non-empty; an empty list means nothing to remove + State->RemoveOrphanDevices(db, args.OrphanDevices); + } +} + +void TDiskRegistryActor::CompleteRemoveOrphanDevices( + const TActorContext& ctx, + TTxDiskRegistry::TRemoveOrphanDevices& args) +{ + Y_UNUSED(args); + Y_UNUSED(ctx); +} + +} // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp index 1591631abf5..9b8da57cb2b 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp @@ -3689,6 +3689,8 @@ void TDiskRegistryState::ForgetDevices( DeviceList.ForgetDevice(id); db.DeleteSuspendedDevice(id); db.DeleteDirtyDevice(id); + + DeleteAutomaticallyReplacedDevice(db, id); } } @@ -7506,6 +7508,45 @@ TVector TDiskRegistryState::QueryAgentsInfo() const return ret; } +TVector TDiskRegistryState::FindOrphanDevices() const +{ + THashSet allKnownDevicesWithAgents; + for (const auto& agent: AgentList.GetAgents()) { + for (const auto& device: agent.GetDevices()) { + const auto& deviceUUID = device.GetDeviceUUID(); + allKnownDevicesWithAgents.insert(deviceUUID); + } + } + + TVector orphanDevices; + for (auto& deviceUUID: DeviceList.GetDirtyDevicesId()) { + if (!allKnownDevicesWithAgents.contains(deviceUUID)) { + orphanDevices.emplace_back(std::move(deviceUUID)); + } + } + for (auto& device: DeviceList.GetSuspendedDevices()) { + if (!allKnownDevicesWithAgents.contains(device.GetId())) { + orphanDevices.emplace_back(std::move(*device.MutableId())); + } + } + for (const auto& deviceUUID: AutomaticallyReplacedDeviceIds) { + if (!allKnownDevicesWithAgents.contains(deviceUUID)) { + orphanDevices.emplace_back(deviceUUID); + } + } + + SortUnique(orphanDevices); + + return orphanDevices; +} + +void TDiskRegistryState::RemoveOrphanDevices( + TDiskRegistryDatabase& db, + const 
TVector& orphanDevicesIds) +{ + ForgetDevices(db, orphanDevicesIds); +} + std::optional TDiskRegistryState::GetDiskBlockCount( const TDiskId& diskId) const { diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h index 3d806eb2189..3b78b8786fb 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h @@ -864,6 +864,12 @@ class TDiskRegistryState TVector QueryAgentsInfo() const; + TVector FindOrphanDevices() const; + + void RemoveOrphanDevices( + TDiskRegistryDatabase& db, + const TVector& orphanDevicesIds); + private: void ProcessConfig(const NProto::TDiskRegistryConfig& config); void ProcessDisks(TVector disks); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp index 8c2110ef9da..652f20a241c 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp @@ -11911,6 +11911,126 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest) // Crit event shouldn't be reported UNIT_ASSERT_VALUES_EQUAL(2, criticalEvents->Val()); } -} + Y_UNIT_TEST(ShouldRemoveAlreadyLeakedDevices) + { + TTestExecutor executor; + + executor.WriteTx([&](TDiskRegistryDatabase db) { db.InitSchema(); }); + + const auto agent = AgentConfig( + 1, + "agent-1", + { + Device("NVMENBS01", "uuid-2.1", "rack-2"), + Device("NVMENBS02", "uuid-2.2", "rack-2"), + Device("NVMENBS03", "uuid-2.3", "rack-2"), + }); + + const TString leakedDirtyDevice = "uuid-100.1"; + const TString leakedSuspendedDevice = "uuid-100.2"; + const TString leakedAutomaticallyReplacedDevice = "uuid-100.3"; + + const TVector allLeakedDevices = { + leakedDirtyDevice, + leakedSuspendedDevice, + leakedAutomaticallyReplacedDevice}; + + // Add leaked devices. 
+ executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + db.UpdateDirtyDevice(leakedDirtyDevice, ""); + NProto::TSuspendedDevice device; + device.SetId(leakedSuspendedDevice); + db.UpdateSuspendedDevice(device); + db.AddAutomaticallyReplacedDevice( + TAutomaticallyReplacedDeviceInfo{ + leakedAutomaticallyReplacedDevice, + Now()}); + }); + + // Register agent. + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto state = TDiskRegistryStateBuilder::LoadState(db).Build(); + + auto orphanDevices = state.FindOrphanDevices(); + UNIT_ASSERT_EQUAL(static_cast(3), orphanDevices.size()); + for (const auto& leakedDevice: allLeakedDevices) { + UNIT_ASSERT_UNEQUAL( + orphanDevices.end(), + Find(orphanDevices, leakedDevice)); + } + + state.RemoveOrphanDevices(db, orphanDevices); + + // Check that device cleaned up from tables. + const auto dirtyDevicesFromState = state.GetDirtyDevices(); + UNIT_ASSERT_EQUAL( + dirtyDevicesFromState.end(), + FindIf( + dirtyDevicesFromState, + [&](const TDeviceConfig& val) + { return val.deviceuuid() == leakedDirtyDevice; })); + + TVector dirtyDevicesDb; + db.ReadDirtyDevices(dirtyDevicesDb); + UNIT_ASSERT_EQUAL( + dirtyDevicesDb.end(), + FindIf( + dirtyDevicesDb, + [&](const TDirtyDevice& val) + { return val.Id == leakedDirtyDevice; })); + + const auto suspendedDevicesFromState = + state.GetSuspendedDevices(); + + auto deviceIdPredicateForSuspendDevices = + [&](const NProto::TSuspendedDevice& val) + { + return val.GetId() == leakedSuspendedDevice; + }; + + UNIT_ASSERT_EQUAL( + suspendedDevicesFromState.end(), + FindIf( + suspendedDevicesFromState, + deviceIdPredicateForSuspendDevices)); + + TVector suspendedDevicesDb; + db.ReadSuspendedDevices(suspendedDevicesDb); + UNIT_ASSERT_EQUAL( + suspendedDevicesDb.end(), + FindIf( + suspendedDevicesDb, + deviceIdPredicateForSuspendDevices)); + + auto deviceIdPredicateForAutomaticallyReplacedDevices = + [&](const TAutomaticallyReplacedDeviceInfo& val) + { + return val.DeviceId == 
leakedAutomaticallyReplacedDevice; + }; + + const auto automaticallyReplacedDevicesFromState = + state.GetAutomaticallyReplacedDevices(); + UNIT_ASSERT_EQUAL( + automaticallyReplacedDevicesFromState.end(), + FindIf( + automaticallyReplacedDevicesFromState, + deviceIdPredicateForAutomaticallyReplacedDevices)); + + TDeque + automaticallyReplacedDevicesFromDb; + db.ReadAutomaticallyReplacedDevices( + automaticallyReplacedDevicesFromDb); + UNIT_ASSERT_EQUAL( + automaticallyReplacedDevicesFromDb.end(), + FindIf( + automaticallyReplacedDevicesFromDb, + deviceIdPredicateForAutomaticallyReplacedDevices)); + }); + } +} } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut_cms.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut_cms.cpp index 9235a6c8fa8..898b67e72f2 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut_cms.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut_cms.cpp @@ -720,6 +720,114 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateCMSTest) }); } + Y_UNIT_TEST(ShouldRemoveDevicesAfterAgentDelete) + { + TTestExecutor executor; + executor.WriteTx([&](TDiskRegistryDatabase db) { db.InitSchema(); }); + + const auto agent = AgentConfig( + 2, + "agent-2", + { + Device("NVMENBS01", "uuid-2.1", "rack-2"), + Device("NVMENBS02", "uuid-2.2", "rack-2"), + Device("NVMENBS03", "uuid-2.3", "rack-2"), + }); + + // Init state. + { + TDiskRegistryState state = + TDiskRegistryStateBuilder().WithConfig({agent}).Build(); + + // Register agent. + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + UNIT_ASSERT_SUCCESS(RegisterAgent(state, db, agent, Now())); + for (const auto& device: agent.GetDevices()) { + state.MarkDeviceAsClean( + Now(), + db, + device.GetDeviceUUID()); + } + }); + + // Mark devices. 
+ executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + state.MarkDeviceAsDirty( + db, + agent.devices()[0].deviceuuid()); + state.SuspendDevice(db, agent.devices()[1].deviceuuid()); + db.AddAutomaticallyReplacedDevice( + TAutomaticallyReplacedDeviceInfo{ + agent.devices()[2].deviceuuid(), + Now()}); + }); + } + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + // Load state from db. + auto state = TDiskRegistryStateBuilder::LoadState(db) + .WithConfig({agent}) + .Build(); + + // Remove agent. + TVector affectedDisks; + TDuration timeout; + + UNIT_ASSERT_SUCCESS(state.UpdateCmsHostState( + db, + agent.GetAgentId(), + NProto::AGENT_STATE_WARNING, + Now(), + false, // dryRun + affectedDisks, + timeout)); + + UNIT_ASSERT_VALUES_EQUAL(1, state.GetAgents().size()); + + UNIT_ASSERT_SUCCESS(state.PurgeHost( + db, + agent.GetAgentId(), + Now(), + false, // dryRun + affectedDisks)); + + UNIT_ASSERT_SUCCESS(state.UpdateAgentState( + db, + agent.GetAgentId(), + NProto::AGENT_STATE_UNAVAILABLE, + Now(), + "lost", + affectedDisks)); + + UNIT_ASSERT_VALUES_EQUAL(0, state.GetAgents().size()); + + UNIT_ASSERT_VALUES_EQUAL(0, state.GetDirtyDevices().size()); + TVector dirtyDevices; + db.ReadDirtyDevices(dirtyDevices); + UNIT_ASSERT_VALUES_EQUAL(0, dirtyDevices.size()); + + UNIT_ASSERT_VALUES_EQUAL(0, state.GetSuspendedDevices().size()); + TVector suspendedDevices; + db.ReadSuspendedDevices(suspendedDevices); + UNIT_ASSERT_VALUES_EQUAL(0, suspendedDevices.size()); + + UNIT_ASSERT_VALUES_EQUAL( + 0, + state.GetAutomaticallyReplacedDevices().size()); + TDeque + automaticalyReplacedDevices; + db.ReadAutomaticallyReplacedDevices( + automaticalyReplacedDevices); + UNIT_ASSERT_VALUES_EQUAL(0, automaticalyReplacedDevices.size()); + }); + } + Y_UNIT_TEST(ShouldReturnAffectedDisksFromPurgeHost) { TTestExecutor executor; diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h index 
43619484ef4..b3172e87be2 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h @@ -61,6 +61,7 @@ namespace NCloud::NBlockStore::NStorage { xxx(DeallocateCheckpoint, __VA_ARGS__) \ xxx(SetCheckpointDataState, __VA_ARGS__) \ xxx(PurgeHostCms, __VA_ARGS__) \ + xxx(RemoveOrphanDevices, __VA_ARGS__) \ // BLOCKSTORE_DISK_REGISTRY_TRANSACTIONS //////////////////////////////////////////////////////////////////////////////// @@ -1429,6 +1430,24 @@ struct TTxDiskRegistry Error.Clear(); } }; + + // + // RemoveOrphanDevices + // + + struct TRemoveOrphanDevices + { + TVector OrphanDevices; + + explicit TRemoveOrphanDevices(TVector orphanDevices) + : OrphanDevices(std::move(orphanDevices)) + {} + + void Clear() + { + // nothing to do + } + }; }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp index 4cbe6b32257..e02f063f8af 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp @@ -774,6 +774,11 @@ TVector TDeviceList::GetDirtyDevices() const return devices; } +TVector TDeviceList::GetDirtyDevicesId() const +{ + return {DirtyDevices.begin(), DirtyDevices.end()}; +} + bool TDeviceList::IsDirtyDevice(const TDeviceId& uuid) const { return DirtyDevices.contains(uuid); diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h index a3c4bbdb444..b345f04263b 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h @@ -123,6 +123,7 @@ class TDeviceList [[nodiscard]] TVector GetBrokenDevices() const; [[nodiscard]] TVector GetDirtyDevices() const; + [[nodiscard]] TVector GetDirtyDevicesId() 
const; NProto::TDeviceConfig AllocateDevice( const TDiskId& diskId, diff --git a/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.cpp b/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.cpp index 308004cdbb4..6538736fb00 100644 --- a/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.cpp @@ -460,6 +460,29 @@ TString GetReplicaTableRepr( //////////////////////////////////////////////////////////////////////////////// +TDiskRegistryStateBuilder TDiskRegistryStateBuilder::LoadState( + TDiskRegistryDatabase& db) +{ + TDiskRegistryStateBuilder builder; + + db.ReadDiskRegistryConfig(builder.Config); + db.ReadDirtyDevices(builder.DirtyDevices); + db.ReadAgents(builder.Agents); + db.ReadDisks(builder.Disks); + db.ReadPlacementGroups(builder.PlacementGroups); + db.ReadBrokenDisks(builder.BrokenDisks); + db.ReadDisksToReallocate(builder.DisksToReallocate); + db.ReadErrorNotifications(builder.ErrorNotifications); + db.ReadUserNotifications(builder.UserNotifications); + db.ReadDisksToCleanup(builder.DisksToCleanup); + db.ReadOutdatedVolumeConfigs(builder.OutdatedVolumeConfigs); + db.ReadSuspendedDevices(builder.SuspendedDevices); + db.ReadAutomaticallyReplacedDevices(builder.AutomaticallyReplacedDevices); + db.ReadDiskRegistryAgentListParams(builder.DiskRegistryAgentListParams); + + return builder; +} + TDiskRegistryState TDiskRegistryStateBuilder::Build() { return TDiskRegistryState( diff --git a/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.h b/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.h index dc45e642aa6..a60a5c9d287 100644 --- a/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/testlib/test_state.h @@ -254,6 +254,8 @@ struct TDiskRegistryStateBuilder TDeque AutomaticallyReplacedDevices; THashMap DiskRegistryAgentListParams; + static TDiskRegistryStateBuilder 
LoadState(TDiskRegistryDatabase& db); + TDiskRegistryState Build(); TDiskRegistryStateBuilder& With(TStorageConfigPtr config); diff --git a/cloud/blockstore/libs/storage/disk_registry/ya.make b/cloud/blockstore/libs/storage/disk_registry/ya.make index 944f24ec2b9..fc8d924270c 100644 --- a/cloud/blockstore/libs/storage/disk_registry/ya.make +++ b/cloud/blockstore/libs/storage/disk_registry/ya.make @@ -6,6 +6,7 @@ SRCS( disk_registry_actor_backup_state.cpp disk_registry_actor_change_disk_device.cpp disk_registry_actor_checkpoint.cpp + disk_registry_actor_remove_orphan_devices.cpp disk_registry_actor_cleanup.cpp disk_registry_actor_cms.cpp disk_registry_actor_config.cpp