Skip to content

Commit

Permalink
issue-2074: clean an unoccupied device if its configuration has been …
Browse files Browse the repository at this point in the history
…changed (#2204)

* issue-2074: clean a device if its serial number has been changed
  • Loading branch information
sharpeye authored Nov 14, 2024
1 parent 65bc209 commit cb26896
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 49 deletions.
1 change: 1 addition & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ namespace NCloud::NBlockStore {
xxx(DiskRegistryAgentDevicePoolConfigMismatch) \
xxx(DiskRegistryPurgeHostError) \
xxx(DiskRegistryCleanupAgentConfigError) \
xxx(DiskRegistryOccupiedDeviceConfigurationHasChanged) \
// BLOCKSTORE_CRITICAL_EVENTS

#define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \
Expand Down
99 changes: 55 additions & 44 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,32 @@ auto CollectAllocatedDevices(const TVector<NProto::TDiskConfig>& disks)
return r;
}

bool HasNewLayout(
const NProto::TDeviceConfig& newConfig,
const NProto::TDeviceConfig& oldConfig)
{
auto key = [] (const NProto::TDeviceConfig& device) {
return std::make_tuple(
device.GetBlockSize(),
device.GetBlocksCount(),
device.GetUnadjustedBlockCount(),
TStringBuf {device.GetDeviceName()}
);
};

return key(oldConfig) != key(newConfig) ||
(oldConfig.GetPhysicalOffset() &&
oldConfig.GetPhysicalOffset() != newConfig.GetPhysicalOffset());
}

bool HasNewSerialNumber(
const NProto::TDeviceConfig& newConfig,
const NProto::TDeviceConfig& oldConfig)
{
return oldConfig.GetSerialNumber() &&
oldConfig.GetSerialNumber() != newConfig.GetSerialNumber();
}

} // namespace

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -942,15 +968,40 @@ auto TDiskRegistryState::RegisterAgent(
for (auto& d: *agent.MutableDevices()) {
const auto& uuid = d.GetDeviceUUID();

auto error = CheckDestructiveConfigurationChange(d, r.OldConfigs);
if (HasError(error)) {
STORAGE_ERROR(error.GetMessage());
auto* oldConfig = r.OldConfigs.FindPtr(uuid);
const auto diskId = FindDisk(uuid);

SetDeviceErrorState(d, timestamp, error.GetMessage());
const bool isChangeDestructive =
oldConfig && (HasNewLayout(d, *oldConfig) ||
HasNewSerialNumber(d, *oldConfig));

if (diskId && isChangeDestructive) {
TString message =
TStringBuilder()
<< "Device configuration has changed: " << *oldConfig
<< " -> " << d << ". Affected disk: " << diskId;

if (d.GetState() != NProto::DEVICE_STATE_ERROR ||
HasNewLayout(d, *oldConfig))
{
ReportDiskRegistryOccupiedDeviceConfigurationHasChanged(
message);
} else {
STORAGE_WARN(message);
}

SetDeviceErrorState(d, timestamp, std::move(message));

continue;
}

// To prevent data leakage we should clean the available device if
// its configuration has been changed
if (diskId.empty() && isChangeDestructive && !IsDirtyDevice(uuid)) {
DeviceList.MarkDeviceAsDirty(uuid);
db.UpdateDirtyDevice(uuid, {});
}

AdjustDeviceIfNeeded(d, timestamp);
if (r.NewDevices.contains(uuid)) {
SuspendDeviceIfNeeded(db, d);
Expand Down Expand Up @@ -1041,46 +1092,6 @@ auto TDiskRegistryState::RegisterAgent(
.DevicesToDisableIO = std::move(devicesToDisableIO)};
}

NProto::TError TDiskRegistryState::CheckDestructiveConfigurationChange(
const NProto::TDeviceConfig& device,
const THashMap<TDeviceId, NProto::TDeviceConfig>& oldConfigs) const
{
auto* oldConfig = oldConfigs.FindPtr(device.GetDeviceUUID());
if (!oldConfig) {
return {};
}

const auto diskId = FindDisk(device.GetDeviceUUID());

if (diskId.empty()) {
return {};
}

auto key = [] (const NProto::TDeviceConfig& d) {
return std::make_tuple(
d.GetBlockSize(),
d.GetBlocksCount(),
d.GetUnadjustedBlockCount(),
TStringBuf {d.GetDeviceName()}
);
};

const auto oldKey = key(*oldConfig);
const auto newKey = key(device);

if (oldKey == newKey && (oldConfig->GetSerialNumber().empty()
|| oldConfig->GetSerialNumber() == device.GetSerialNumber())
&& (oldConfig->GetPhysicalOffset() == 0
|| oldConfig->GetPhysicalOffset() == device.GetPhysicalOffset()))
{
return {};
}

return MakeError(E_ARGUMENT, TStringBuilder()
<< "Device configuration has changed: "
<< *oldConfig << " -> " << device << ". Affected disk: " << diskId);
}

NProto::TError TDiskRegistryState::UnregisterAgent(
TDiskRegistryDatabase& db,
ui32 nodeId)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1269,10 +1269,6 @@ class TDiskRegistryState
TDiskId& affectedDisk,
TDuration& timeout);

NProto::TError CheckDestructiveConfigurationChange(
const NProto::TDeviceConfig& device,
const THashMap<TDeviceId, NProto::TDeviceConfig>& oldConfigs) const;

void ResetMigrationStartTsIfNeeded(TDiskState& disk);

struct TConfigUpdateEffect;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8677,7 +8677,7 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
});

const auto agent1b = AgentConfig(1000, {
Device("dev-1", "uuid-1.2", "rack-1"),
Device("dev-2", "uuid-1.2", "rack-1"),
});

TDiskRegistryState state = TDiskRegistryStateBuilder()
Expand Down Expand Up @@ -11774,6 +11774,114 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateTest)
UNIT_ASSERT_VALUES_EQUAL(10_GB, allocatedBytes->Val());
UNIT_ASSERT_VALUES_EQUAL(0, dirtyBytes->Val());
}

Y_UNIT_TEST(ShouldCleanUnoccupiedDevicesWithNewSerialNumber)
{
auto monitoring = CreateMonitoringServiceStub();
auto rootGroup = monitoring->GetCounters()
->GetSubgroup("counters", "blockstore");

auto serverGroup = rootGroup->GetSubgroup("component", "server");
InitCriticalEventsCounter(serverGroup);

auto criticalEvents = serverGroup->FindCounter(
"AppCriticalEvents/"
"DiskRegistryOccupiedDeviceConfigurationHasChanged");

UNIT_ASSERT_VALUES_EQUAL(0, criticalEvents->Val());

TTestExecutor executor;
executor.WriteTx([&] (TDiskRegistryDatabase db) {
db.InitSchema();
});

auto agentConfig = AgentConfig(1, {
Device("NVMENBS01", "uuid-1.1"),
Device("NVMENBS02", "uuid-1.2"),
});

TDiskRegistryState state = TDiskRegistryStateBuilder()
.WithKnownAgents({agentConfig})
.WithDisks({
Disk("disk-1", {"uuid-1.2"}),
})
.Build();

executor.WriteTx(
[&](TDiskRegistryDatabase db)
{
auto [r, error] = state.RegisterAgent(db, agentConfig, Now());

UNIT_ASSERT_SUCCESS(error);
UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size());
UNIT_ASSERT_VALUES_EQUAL(0, state.GetDirtyDevices().size());
});

agentConfig.MutableDevices(0)->SetSerialNumber("SN-X-1");
agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-1");
agentConfig.MutableDevices(0)->SetPhysicalOffset(1000);
agentConfig.MutableDevices(1)->SetPhysicalOffset(1000);

executor.WriteTx(
[&](TDiskRegistryDatabase db)
{
auto [r, error] = state.RegisterAgent(db, agentConfig, Now());

UNIT_ASSERT_SUCCESS(error);
UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size());
UNIT_ASSERT_VALUES_EQUAL(0, state.GetDirtyDevices().size());
});

agentConfig.MutableDevices(0)->SetSerialNumber("SN-X-2");
agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-2");

executor.WriteTx(
[&](TDiskRegistryDatabase db)
{
auto [r, error] = state.RegisterAgent(db, agentConfig, Now());

UNIT_ASSERT_SUCCESS(error);
UNIT_ASSERT_VALUES_EQUAL(1, r.AffectedDisks.size());
UNIT_ASSERT_VALUES_EQUAL("disk-1", r.AffectedDisks[0]);
const auto& dd = state.GetDirtyDevices();
UNIT_ASSERT_VALUES_EQUAL(1, dd.size());
UNIT_ASSERT_VALUES_EQUAL("uuid-1.1", dd[0].GetDeviceUUID());
UNIT_ASSERT_VALUES_EQUAL(
agentConfig.GetDevices(0).GetSerialNumber(),
dd[0].GetSerialNumber());
});

UNIT_ASSERT_VALUES_EQUAL(1, criticalEvents->Val());

// Change the PhysicalOffset for uuid-1.2
agentConfig.MutableDevices(1)->SetPhysicalOffset(42);

executor.WriteTx(
[&](TDiskRegistryDatabase db)
{
auto [r, error] = state.RegisterAgent(db, agentConfig, Now());

UNIT_ASSERT_SUCCESS(error);
UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size());
});

UNIT_ASSERT_VALUES_EQUAL(2, criticalEvents->Val());

// Change the SN for uuid-1.2 once more
agentConfig.MutableDevices(1)->SetSerialNumber("SN-Y-3");

executor.WriteTx(
[&](TDiskRegistryDatabase db)
{
auto [r, error] = state.RegisterAgent(db, agentConfig, Now());

UNIT_ASSERT_SUCCESS(error);
UNIT_ASSERT_VALUES_EQUAL(0, r.AffectedDisks.size());
});

// Crit event shouldn't be reported
UNIT_ASSERT_VALUES_EQUAL(2, criticalEvents->Val());
}
}

} // namespace NCloud::NBlockStore::NStorage

0 comments on commit cb26896

Please sign in to comment.