From cb268966daf72cc2defb6e552df090abd7a562eb Mon Sep 17 00:00:00 2001 From: Pavel Misko Date: Thu, 14 Nov 2024 12:57:58 +0100 Subject: [PATCH] issue-2074: clean an unoccupied device if its configuration has been changed (#2204) * issue-2074: clean a device if its serial number has been changed --- .../libs/diagnostics/critical_events.h | 1 + .../disk_registry/disk_registry_state.cpp | 99 +++++++++------- .../disk_registry/disk_registry_state.h | 4 - .../disk_registry/disk_registry_state_ut.cpp | 110 +++++++++++++++++- 4 files changed, 165 insertions(+), 49 deletions(-) diff --git a/cloud/blockstore/libs/diagnostics/critical_events.h b/cloud/blockstore/libs/diagnostics/critical_events.h index 7395c1239a3..c66e9609149 100644 --- a/cloud/blockstore/libs/diagnostics/critical_events.h +++ b/cloud/blockstore/libs/diagnostics/critical_events.h @@ -62,6 +62,7 @@ namespace NCloud::NBlockStore { xxx(DiskRegistryAgentDevicePoolConfigMismatch) \ xxx(DiskRegistryPurgeHostError) \ xxx(DiskRegistryCleanupAgentConfigError) \ + xxx(DiskRegistryOccupiedDeviceConfigurationHasChanged) \ // BLOCKSTORE_CRITICAL_EVENTS #define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \ diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp index 51e768684c4..26257fe49e6 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp @@ -270,6 +270,32 @@ auto CollectAllocatedDevices(const TVector& disks) return r; } +bool HasNewLayout( + const NProto::TDeviceConfig& newConfig, + const NProto::TDeviceConfig& oldConfig) +{ + auto key = [] (const NProto::TDeviceConfig& device) { + return std::make_tuple( + device.GetBlockSize(), + device.GetBlocksCount(), + device.GetUnadjustedBlockCount(), + TStringBuf {device.GetDeviceName()} + ); + }; + + return key(oldConfig) != key(newConfig) || + (oldConfig.GetPhysicalOffset() && + oldConfig.GetPhysicalOffset() != newConfig.GetPhysicalOffset()); +} + +bool HasNewSerialNumber( + const NProto::TDeviceConfig& newConfig, + const NProto::TDeviceConfig& oldConfig) +{ + return oldConfig.GetSerialNumber() && + oldConfig.GetSerialNumber() != newConfig.GetSerialNumber(); +} + } // namespace //////////////////////////////////////////////////////////////////////////////// @@ -942,15 +968,40 @@ auto TDiskRegistryState::RegisterAgent( for (auto& d: *agent.MutableDevices()) { const auto& uuid = d.GetDeviceUUID(); - auto error = CheckDestructiveConfigurationChange(d, r.OldConfigs); - if (HasError(error)) { - STORAGE_ERROR(error.GetMessage()); + auto* oldConfig = r.OldConfigs.FindPtr(uuid); + const auto diskId = FindDisk(uuid); - SetDeviceErrorState(d, timestamp, error.GetMessage()); + const bool isChangeDestructive = + oldConfig && (HasNewLayout(d, *oldConfig) || + HasNewSerialNumber(d, *oldConfig)); + + if (diskId && isChangeDestructive) { + TString message = + TStringBuilder() + << "Device configuration has changed: " << *oldConfig + << " -> " << d << ". Affected disk: " << diskId; + + if (d.GetState() != NProto::DEVICE_STATE_ERROR || + HasNewLayout(d, *oldConfig)) + { + ReportDiskRegistryOccupiedDeviceConfigurationHasChanged( + message); + } else { + STORAGE_WARN(message); + } + + SetDeviceErrorState(d, timestamp, std::move(message)); continue; } + // To prevent data leakage we should clean the available device if + // its configuration has been changed + if (diskId.empty() && isChangeDestructive && !IsDirtyDevice(uuid)) { + DeviceList.MarkDeviceAsDirty(uuid); + db.UpdateDirtyDevice(uuid, {}); + } + AdjustDeviceIfNeeded(d, timestamp); if (r.NewDevices.contains(uuid)) { SuspendDeviceIfNeeded(db, d); @@ -1041,46 +1092,6 @@ auto TDiskRegistryState::RegisterAgent( .DevicesToDisableIO = std::move(devicesToDisableIO)}; } -NProto::TError TDiskRegistryState::CheckDestructiveConfigurationChange( - const NProto::TDeviceConfig& device, - const THashMap& oldConfigs) const -{ - auto* oldConfig = oldConfigs.FindPtr(device.GetDeviceUUID()); - if (!oldConfig) { - return {}; - } - - const auto diskId = FindDisk(device.GetDeviceUUID()); - - if (diskId.empty()) { - return {}; - } - - auto key = [] (const NProto::TDeviceConfig& d) { - return std::make_tuple( - d.GetBlockSize(), - d.GetBlocksCount(), - d.GetUnadjustedBlockCount(), - TStringBuf {d.GetDeviceName()} - ); - }; - - const auto oldKey = key(*oldConfig); - const auto newKey = key(device); - - if (oldKey == newKey && (oldConfig->GetSerialNumber().empty() - || oldConfig->GetSerialNumber() == device.GetSerialNumber()) - && (oldConfig->GetPhysicalOffset() == 0 - || oldConfig->GetPhysicalOffset() == device.GetPhysicalOffset())) - { - return {}; - } - - return MakeError(E_ARGUMENT, TStringBuilder() - << "Device configuration has changed: " - << *oldConfig << " -> " << device << ". Affected disk: " << diskId); -} - NProto::TError TDiskRegistryState::UnregisterAgent( TDiskRegistryDatabase& db, ui32 nodeId) diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h index 019b021903f..293b4305da3 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h @@ -1269,10 +1269,6 @@ class TDiskRegistryState TDiskId& affectedDisk, TDuration& timeout); - NProto::TError CheckDestructiveConfigurationChange( - const NProto::TDeviceConfig& device, - const THashMap& oldConfigs) const; - void ResetMigrationStartTsIfNeeded(TDiskState& disk); struct TConfigUpdateEffect; diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp index 14716915646..ed0b41bdf26 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state_ut.cpp @@ -8677,7 +8677,7 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest) }); const auto agent1b = AgentConfig(1000, { - Device("dev-1", "uuid-1.2", "rack-1"), + Device("dev-2", "uuid-1.2", "rack-1"), }); TDiskRegistryState state = TDiskRegistryStateBuilder() @@ -11774,6 +11774,114 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest) UNIT_ASSERT_VALUES_EQUAL(10_GB, allocatedBytes->Val()); UNIT_ASSERT_VALUES_EQUAL(0, dirtyBytes->Val()); } + + Y_UNIT_TEST(ShouldCleanUnoccupiedDevicesWithNewSerialNumber) + { + auto monitoring = CreateMonitoringServiceStub(); + auto rootGroup = monitoring->GetCounters() + ->GetSubgroup("counters", "blockstore"); + + auto serverGroup = rootGroup->GetSubgroup("component", "server"); + InitCriticalEventsCounter(serverGroup); + + auto criticalEvents = serverGroup->FindCounter( + "AppCriticalEvents/" + "DiskRegistryOccupiedDeviceConfigurationHasChanged"); + + UNIT_ASSERT_VALUES_EQUAL(0, criticalEvents->Val()); + + TTestExecutor executor; + executor.WriteTx([&] (TDiskRegistryDatabase db) { + db.InitSchema(); + }); + + auto agentConfig = AgentConfig(1, { + Device("NVMENBS01", "uuid-1.1"), + Device("NVMENBS02", "uuid-1.2"), + }); + + TDiskRegistryState state = TDiskRegistryStateBuilder() + .WithKnownAgents({agentConfig}) + .WithDisks({ + Disk("disk-1", {"uuid-1.2"}), + }) + .Build(); + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto [r, error] = state.RegisterAgent(db, agentConfig, Now()); + + UNIT_ASSERT_SUCCESS(error); + UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size()); + UNIT_ASSERT_VALUES_EQUAL(0, state.GetDirtyDevices().size()); + }); + + agentConfig.MutableDevices(0)->SetSerialNumber("SN-X-1"); + agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-1"); + agentConfig.MutableDevices(0)->SetPhysicalOffset(1000); + agentConfig.MutableDevices(1)->SetPhysicalOffset(1000); + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto [r, error] = state.RegisterAgent(db, agentConfig, Now()); + + UNIT_ASSERT_SUCCESS(error); + UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size()); + UNIT_ASSERT_VALUES_EQUAL(0, state.GetDirtyDevices().size()); + }); + + agentConfig.MutableDevices(0)->SetSerialNumber("SN-X-2"); + agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-2"); + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto [r, error] = state.RegisterAgent(db, agentConfig, Now()); + + UNIT_ASSERT_SUCCESS(error); + UNIT_ASSERT_VALUES_EQUAL(1, r.AffectedDisks.size()); + UNIT_ASSERT_VALUES_EQUAL("disk-1", r.AffectedDisks[0]); + const auto& dd = state.GetDirtyDevices(); + UNIT_ASSERT_VALUES_EQUAL(1, dd.size()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.1", dd[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL( + agentConfig.GetDevices(0).GetSerialNumber(), + dd[0].GetSerialNumber()); + }); + + UNIT_ASSERT_VALUES_EQUAL(1, criticalEvents->Val()); + + // Change the PhysicalOffset for uuid-1.2 + agentConfig.MutableDevices(1)->SetPhysicalOffset(42); + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto [r, error] = state.RegisterAgent(db, agentConfig, Now()); + + UNIT_ASSERT_SUCCESS(error); + UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size()); + }); + + UNIT_ASSERT_VALUES_EQUAL(2, criticalEvents->Val()); + + // Change the SN for uuid-1.2 once more + agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-3"); + + executor.WriteTx( + [&](TDiskRegistryDatabase db) + { + auto [r, error] = state.RegisterAgent(db, agentConfig, Now()); + + UNIT_ASSERT_SUCCESS(error); + UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size()); + }); + + // Crit event shouldn't be reported + UNIT_ASSERT_VALUES_EQUAL(2, criticalEvents->Val()); + } } } // namespace NCloud::NBlockStore::NStorage