From 2876ba42c43c18664efd1fa3db856b7211ea4232 Mon Sep 17 00:00:00 2001 From: Semyon Danilov Date: Mon, 6 Jan 2025 21:05:11 +0400 Subject: [PATCH] Fix TSlay and THarakiri in ReadOnly mode (#13162) --- .../lib/dstool_cmd_cluster_workload_run.py | 61 ++++- .../nodewarden/node_warden_pdisk.cpp | 17 +- .../pdisk/blobstorage_pdisk_impl.cpp | 24 +- .../pdisk/blobstorage_pdisk_ut.cpp | 90 +++++-- .../blobstorage/pdisk/mock/pdisk_mock.cpp | 9 + .../ut_blobstorage/read_only_pdisk.cpp | 238 ++++++++++++++---- ydb/core/mind/bscontroller/config.cpp | 1 + ydb/core/protos/blobstorage_config.proto | 1 + 8 files changed, 345 insertions(+), 96 deletions(-) diff --git a/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py b/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py index 766841bc03a0..2aae0af01fdd 100644 --- a/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py +++ b/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py @@ -19,6 +19,7 @@ def add_options(p): p.add_argument('--enable-kill-tablets', action='store_true', help='Enable tablet killer') p.add_argument('--enable-kill-blob-depot', action='store_true', help='Enable BlobDepot killer') p.add_argument('--enable-restart-pdisks', action='store_true', help='Enable PDisk restarter') + p.add_argument('--enable-readonly-pdisks', action='store_true', help='Enable SetPDiskReadOnly requests') p.add_argument('--kill-signal', type=str, default='KILL', help='Kill signal to send to restart node') p.add_argument('--sleep-before-rounds', type=float, default=1, help='Seconds to sleep before rounds') p.add_argument('--no-fail-model-check', action='store_true', help='Do not check VDisk states before taking action') @@ -104,6 +105,12 @@ def do(args): if vslot.ReadOnly } + pdisk_readonly = { + (pdisk.NodeId, pdisk.PDiskId) + for pdisk in base_config.PDisk + if pdisk.ReadOnly + } + if (len(pdisk_keys) == 0): # initialize pdisk_keys and pdisk_key_versions for node_id in {pdisk.NodeId for pdisk in base_config.PDisk}: @@ -144,6 +151,25 @@ def match(x): return False return True + def can_act_on_pdisk(node_id, pdisk_id): + def match(x): + return node_id == x[0] and pdisk_id == x[1] + + for group in base_config.Group: + if any(map(match, map(common.get_vslot_id, group.VSlotId))): + if not common.is_dynamic_group(group.GroupId): + return False + + content = { + common.get_vdisk_id_short(vslot): not match(vslot_id) and vslot.Ready and vdisk_status[vslot_id + common.get_vdisk_id(vslot)] + for vslot_id in map(common.get_vslot_id, group.VSlotId) + for vslot in [vslot_map[vslot_id]] + } + common.print_if_verbose(args, content, file=sys.stderr) + if not grouptool.check_fail_model(content, group.ErasureSpecies): + return False + return True + def do_restart(node_id): host = node_fqdn_map[node_id] if args.enable_pdisk_encryption_keys_changes: @@ -166,6 +192,20 @@ def do_restart_pdisk(node_id, pdisk_id): if not response.Success: raise Exception('Unexpected error from BSC: %s' % response.ErrorDescription) + def do_readonly_pdisk(node_id, pdisk_id, readonly): + assert can_act_on_vslot(node_id, pdisk_id) + request = common.kikimr_bsconfig.TConfigRequest(IgnoreDegradedGroupsChecks=True) + cmd = request.Command.add().SetPDiskReadOnly + cmd.TargetPDiskId.NodeId = node_id + cmd.TargetPDiskId.PDiskId = pdisk_id + cmd.Value = readonly + try: + response = common.invoke_bsc_request(request) + except Exception as e: + raise Exception('failed to perform SetPDiskReadOnly request: %s' % e) + if not response.Success: + raise Exception('Unexpected error from BSC: %s' % response.ErrorDescription) + def do_evict(vslot_id): assert can_act_on_vslot(*vslot_id) try: @@ -253,15 +293,16 @@ def do_kill_blob_depot(): readonlies = [] unreadonlies = [] pdisk_restarts = [] + make_pdisks_readonly = [] + make_pdisks_not_readonly = [] for vslot in base_config.VSlot: if common.is_dynamic_group(vslot.GroupId): vslot_id = common.get_vslot_id(vslot.VSlotId) + node_id, pdisk_id = vslot_id[:2] vdisk_id = '[%08x:%d:%d:%d]' % (vslot.GroupId, vslot.FailRealmIdx, vslot.FailDomainIdx, vslot.VDiskIdx) if vslot_id in vslot_readonly and not args.disable_readonly: unreadonlies.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False))) - if can_act_on_vslot(*vslot_id[:2]) and args.enable_restart_pdisks: - pdisk_restarts.append(('restart pdisk node_id: %d, pdisk_id: %d' % vslot_id[:2], (do_restart_pdisk, *vslot_id[:2]))) if can_act_on_vslot(*vslot_id) and (recent_restarts or args.disable_restarts): if not args.disable_evicts: evicts.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id))) @@ -270,6 +311,18 @@ def do_kill_blob_depot(): if not args.disable_readonly: readonlies.append(('readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, True))) + for pdisk in base_config.PDisk: + node_id, pdisk_id = pdisk.NodeId, pdisk.PDiskId + + if can_act_on_pdisk(node_id, pdisk_id): + if args.enable_restart_pdisks: + pdisk_restarts.append(('restart pdisk node_id: %d, pdisk_id: %d' % (node_id, pdisk_id), (do_restart_pdisk, node_id, pdisk_id))) + if args.enable_readonly_pdisks: + make_pdisks_readonly.append(('readonly pdisk node_id: %d, pdisk_id: %d' % (node_id, pdisk_id), (do_readonly_pdisk, node_id, pdisk_id, True))) + + if (node_id, pdisk_id) in pdisk_readonly and args.enable_readonly_pdisks: + make_pdisks_not_readonly.append(('un-readonly pdisk node_id: %d, pdisk_id: %d' % (node_id, pdisk_id), (do_readonly_pdisk, node_id, pdisk_id, False))) + def pick(v): action_name, action = random.choice(v) print(action_name) @@ -285,6 +338,10 @@ def pick(v): possible_actions.append(('un-readonly', (pick, unreadonlies))) if pdisk_restarts: possible_actions.append(('restart-pdisk', (pick, pdisk_restarts))) + if make_pdisks_readonly: + possible_actions.append(('make-pdisks-readonly', (pick, make_pdisks_readonly))) + if make_pdisks_not_readonly: + possible_actions.append(('make-pdisks-not-readonly', (pick, make_pdisks_not_readonly))) restarts = [] diff --git a/ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp b/ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp index 5ab2900a247b..d29112084e1e 100644 --- a/ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp +++ b/ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp @@ -410,21 +410,20 @@ namespace NKikimr::NStorage { const TPDiskKey key(pdisk); - if (pdisk.HasReadOnly()) { - if (auto it = LocalPDisks.find({pdisk.GetNodeID(), pdisk.GetPDiskID()}); it != LocalPDisks.end()) { - auto& record = it->second; + auto localPdiskIt = LocalPDisks.find({pdisk.GetNodeID(), pdisk.GetPDiskID()}); + if (localPdiskIt != LocalPDisks.end()) { + auto& record = localPdiskIt->second; - if (!record.Record.HasReadOnly() || record.Record.GetReadOnly() != pdisk.GetReadOnly()) { - // Changing read-only flag requires restart. - entityStatus = NKikimrBlobStorage::RESTART; - } + if (record.Record.GetReadOnly() != pdisk.GetReadOnly()) { + // Changing read-only flag requires restart. + entityStatus = NKikimrBlobStorage::RESTART; } } switch (entityStatus) { case NKikimrBlobStorage::RESTART: - if (auto it = LocalPDisks.find({pdisk.GetNodeID(), pdisk.GetPDiskID()}); it != LocalPDisks.end()) { - it->second.Record = pdisk; + if (localPdiskIt != LocalPDisks.end()) { + localPdiskIt->second.Record = pdisk; } DoRestartLocalPDisk(pdisk); [[fallthrough]]; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index 9aa7eeb1d1be..56f23529a445 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -3788,8 +3788,6 @@ bool TPDisk::HandleReadOnlyIfWrite(TRequestBase *request) { case ERequestType::RequestChunkReadPiece: case ERequestType::RequestYardInit: case ERequestType::RequestCheckSpace: - case ERequestType::RequestHarakiri: - case ERequestType::RequestYardSlay: case ERequestType::RequestYardControl: case ERequestType::RequestWhiteboartReport: case ERequestType::RequestHttpInfo: @@ -3805,16 +3803,16 @@ bool TPDisk::HandleReadOnlyIfWrite(TRequestBase *request) { // Can't be processed in read-only mode. case ERequestType::RequestLogWrite: { - TLogWrite &ev = *static_cast(request); + TLogWrite &req = *static_cast(request); NPDisk::TEvLogResult* result = new NPDisk::TEvLogResult(NKikimrProto::CORRUPTED, 0, errorReason); - result->Results.push_back(NPDisk::TEvLogResult::TRecord(ev.Lsn, ev.Cookie)); + result->Results.push_back(NPDisk::TEvLogResult::TRecord(req.Lsn, req.Cookie)); PCtx->ActorSystem->Send(sender, result); - ev.Replied = true; + req.Replied = true; return true; } case ERequestType::RequestChunkWrite: { - TChunkWrite &ev = *static_cast(request); - SendChunkWriteError(ev, errorReason, NKikimrProto::CORRUPTED); + TChunkWrite &req = *static_cast(request); + SendChunkWriteError(req, errorReason, NKikimrProto::CORRUPTED); return true; } case ERequestType::RequestChunkReserve: @@ -3829,6 +3827,18 @@ bool TPDisk::HandleReadOnlyIfWrite(TRequestBase *request) { case ERequestType::RequestChunkForget: PCtx->ActorSystem->Send(sender, new NPDisk::TEvChunkForgetResult(NKikimrProto::CORRUPTED, 0, errorReason)); return true; + case ERequestType::RequestHarakiri: + PCtx->ActorSystem->Send(sender, new NPDisk::TEvHarakiriResult(NKikimrProto::CORRUPTED, 0, errorReason)); + return true; + case ERequestType::RequestYardSlay: { + TSlay &req = *static_cast(request); + // We send NOTREADY, since BSController can't handle CORRUPTED or ERROR. + // If for some reason the disk will become *not* read-only, the request will be retried and VDisk will be slain. + // If not, we will be retrying the request until the disk is replaced during maintenance. + PCtx->ActorSystem->Send(sender, new NPDisk::TEvSlayResult(NKikimrProto::NOTREADY, 0, + req.VDiskId, req.SlayOwnerRound, req.PDiskId, req.VSlotId, errorReason)); + return true; + } case ERequestType::RequestWriteMetadata: case ERequestType::RequestWriteMetadataResult: diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut.cpp index 6f2053349863..d570183fa267 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut.cpp @@ -1359,34 +1359,74 @@ Y_UNIT_TEST_SUITE(ReadOnlyPDisk) { vdisk.Init(); // Should start ok. vdisk.ReadLog(); // Should be able to read log. - { - // Should fail on writing log. - auto evLog = MakeHolder(vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, 0, TRcBuf(PrepareData(1)), - TLsnSeg(), nullptr); - auto res = testCtx.TestResponse(evLog.Release(), NKikimrProto::CORRUPTED); - - UNIT_ASSERT_STRING_CONTAINS(res->ErrorReason, "PDisk is in read-only mode"); - } - { - // Should fail on reserving chunk. - auto res = testCtx.TestResponse( - new NPDisk::TEvChunkReserve(vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, 1), - NKikimrProto::CORRUPTED); + } - UNIT_ASSERT_STRING_CONTAINS(res->ErrorReason, "PDisk is in read-only mode"); - } - { - // Should fail on writing chunk. - TString chunkWriteData = PrepareData(1); - auto counter = MakeIntrusive<::NMonitoring::TCounterForPtr>(); - TMemoryConsumer consumer(counter); - TTrackableBuffer buffer(std::move(consumer), chunkWriteData.data(), chunkWriteData.size()); - auto res = testCtx.TestResponse( - new NPDisk::TEvChunkWrite(vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, - 0, 0, new NPDisk::TEvChunkWrite::TBufBackedUpParts(std::move(buffer)), nullptr, false, 0), - NKikimrProto::CORRUPTED); + template + auto CheckReadOnlyRequest(Args&&... args) { + return [args = std::make_tuple(std::forward(args)...)](TActorTestContext& testCtx) { + Request* req = std::apply([](auto&&... unpackedArgs) { + return new Request(std::forward(unpackedArgs)...); + }, args); + THolder res = testCtx.TestResponse(req); + + UNIT_ASSERT_VALUES_EQUAL(res->Status, ExpectedStatus); UNIT_ASSERT_STRING_CONTAINS(res->ErrorReason, "PDisk is in read-only mode"); + }; + } + + Y_UNIT_TEST(ReadOnlyPDiskEvents) { + TActorTestContext testCtx{{}}; + TVDiskMock vdisk(&testCtx); + vdisk.InitFull(); + vdisk.SendEvLogSync(); + + auto cfg = testCtx.GetPDiskConfig(); + cfg->ReadOnly = true; + testCtx.UpdateConfigRecreatePDisk(cfg); + + vdisk.Init(); + vdisk.ReadLog(); + + std::vector> eventSenders = { + // Should fail on writing log. (ERequestType::RequestLogWrite) + CheckReadOnlyRequest( + vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, 0, + TRcBuf(PrepareData(1)), TLsnSeg(), nullptr + ), + // Should fail on writing chunk. (ERequestType::RequestChunkWrite) + CheckReadOnlyRequest( + vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, + 0, 0, nullptr, nullptr, false, 0 + ), + // Should fail on reserving chunk. (ERequestType::RequestChunkReserve) + CheckReadOnlyRequest( + vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound, 1 + ), + // Should fail on chunk lock. (ERequestType::RequestChunkLock) + CheckReadOnlyRequest( + NPDisk::TEvChunkLock::ELockFrom::LOG, 0, NKikimrBlobStorage::TPDiskSpaceColor::YELLOW + ), + // Should fail on chunk unlock. (ERequestType::RequestChunkUnlock) + CheckReadOnlyRequest( + NPDisk::TEvChunkLock::ELockFrom::LOG + ), + // Should fail on chunk forget. (ERequestType::RequestChunkForget) + CheckReadOnlyRequest( + vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound + 1 + ), + // Should fail on harakiri. (ERequestType::RequestHarakiri) + CheckReadOnlyRequest( + vdisk.PDiskParams->Owner, vdisk.PDiskParams->OwnerRound + 1 + ), + // Should fail on slaying vdisk. (ERequestType::RequestYardSlay) + CheckReadOnlyRequest( + vdisk.VDiskID, vdisk.PDiskParams->OwnerRound + 1, 0, 0 + ) + }; + + for (auto sender : eventSenders) { + sender(testCtx); } } } diff --git a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp index 88113367e7e3..b2b43a4bb386 100644 --- a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp +++ b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp @@ -447,8 +447,17 @@ class TPDiskMockActor : public TActorBootstrapped { void Handle(NPDisk::TEvSlay::TPtr ev) { auto *msg = ev->Get(); PDISK_MOCK_LOG(INFO, PDM17, "received TEvSlay", (Msg, msg->ToString())); + auto res = std::make_unique(NKikimrProto::OK, GetStatusFlags(), msg->VDiskId, msg->SlayOwnerRound, msg->PDiskId, msg->VSlotId, TString()); + + if (Impl.IsDiskReadOnly) { + res->Status = NKikimrProto::NOTREADY; + res->ErrorReason = "PDisk is in read-only mode"; + Send(ev->Sender, res.release()); + return; + } + bool found = false; for (auto& [ownerId, owner] : Impl.Owners) { if (!owner.VDiskId.SameExceptGeneration(msg->VDiskId)) { diff --git a/ydb/core/blobstorage/ut_blobstorage/read_only_pdisk.cpp b/ydb/core/blobstorage/ut_blobstorage/read_only_pdisk.cpp index fb0788cb4c0d..76406f7a6418 100644 --- a/ydb/core/blobstorage/ut_blobstorage/read_only_pdisk.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/read_only_pdisk.cpp @@ -4,6 +4,25 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { + NKikimrBlobStorage::TConfigRequest CreateReadOnlyRequest(ui32 nodeId, ui32 pdiskId, bool readOnly, bool ignoreDegraded = false) { + NKikimrBlobStorage::TConfigRequest request; + request.SetIgnoreDegradedGroupsChecks(ignoreDegraded); + + NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); + auto pdisk = cmd->MutableTargetPDiskId(); + cmd->SetValue(readOnly); + pdisk->SetNodeId(nodeId); + pdisk->SetPDiskId(pdiskId); + + return std::move(request); + } + + NKikimrBlobStorage::TConfigResponse SetReadOnly(TEnvironmentSetup& env, ui32 nodeId, ui32 pdiskId, bool readOnly, bool ignoreDegraded = false) { + NKikimrBlobStorage::TConfigRequest request = CreateReadOnlyRequest(nodeId, pdiskId, readOnly, ignoreDegraded); + + return env.Invoke(request); + } + Y_UNIT_TEST(ReadOnlyNotAllowed) { TEnvironmentSetup env({ .NodeCount = 10, @@ -34,16 +53,7 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { for (; it != diskGuids.end(); it++, i++) { auto& diskId = it->first; - NKikimrBlobStorage::TConfigRequest request; - request.SetIgnoreDegradedGroupsChecks(true); - - NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); - auto pdiskId = cmd->MutableTargetPDiskId(); - cmd->SetValue(true); - pdiskId->SetNodeId(diskId.NodeId); - pdiskId->SetPDiskId(diskId.PDiskId); - - auto response = env.Invoke(request); + auto response = SetReadOnly(env, diskId.NodeId, diskId.PDiskId, true, true); if (i < 2) { // Two disks can be set ReadOnly. @@ -74,6 +84,23 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { env.Runtime->SendToPipe(env.TabletId, actorId, ev.release(), 0, TTestActorSystem::GetPipeConfigWithRetries()); } + void CheckDiskIsReadOnly(TEnvironmentSetup& env, const TPDiskId& diskId) { + auto config = env.FetchBaseConfig(); + + for (const NKikimrBlobStorage::TBaseConfig::TPDisk& pdisk : config.GetPDisk()) { + if (pdisk.GetNodeId() == diskId.NodeId && pdisk.GetPDiskId() == diskId.PDiskId) { + UNIT_ASSERT_VALUES_EQUAL(true, pdisk.GetReadOnly()); + break; + } + } + + auto stateIt = env.PDiskMockStates.find(std::pair(diskId.NodeId, diskId.PDiskId)); + + UNIT_ASSERT(stateIt != env.PDiskMockStates.end()); + + UNIT_ASSERT(stateIt->second->IsDiskReadOnly()); + } + Y_UNIT_TEST(RestartAndReadOnlyConsecutive) { // This test ensures that restart that sets disk to read-only is not lost when regular restart is in progress. TEnvironmentSetup env({ @@ -114,16 +141,7 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { } { - NKikimrBlobStorage::TConfigRequest request; - request.SetIgnoreDegradedGroupsChecks(true); - - NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); - auto pdiskId = cmd->MutableTargetPDiskId(); - cmd->SetValue(true); - pdiskId->SetNodeId(diskId.NodeId); - pdiskId->SetPDiskId(diskId.PDiskId); - - auto response = env.Invoke(request); + auto response = SetReadOnly(env, diskId.NodeId, diskId.PDiskId, true, true); UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription()); } @@ -153,11 +171,7 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { UNIT_ASSERT(gotReport); - auto stateIt = env.PDiskMockStates.find(std::pair(diskId.NodeId, diskId.PDiskId)); - - UNIT_ASSERT(stateIt != env.PDiskMockStates.end()); - - UNIT_ASSERT(stateIt->second->IsDiskReadOnly()); + CheckDiskIsReadOnly(env, diskId); } Y_UNIT_TEST(ReadOnlyOneByOne) { @@ -191,16 +205,7 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { auto& diskId = it->first; for (auto val : {true, false}) { - NKikimrBlobStorage::TConfigRequest request; - request.SetIgnoreDegradedGroupsChecks(true); - - NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); - auto pdiskId = cmd->MutableTargetPDiskId(); - cmd->SetValue(val); - pdiskId->SetNodeId(diskId.NodeId); - pdiskId->SetPDiskId(diskId.PDiskId); - - Invoke(env, request); + Invoke(env, CreateReadOnlyRequest(diskId.NodeId, diskId.PDiskId, val, true)); TInstant barrier = env.Runtime->GetClock() + TDuration::Minutes(5); @@ -234,6 +239,10 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { UNIT_ASSERT(gotServiceSetUpdate); UNIT_ASSERT(gotConfigResponse); + if (val) { + CheckDiskIsReadOnly(env, diskId); + } + // Wait for VSlot to become ready after PDisk restart due to ReadOnly status being changed. env.Sim(TDuration::Seconds(30)); } @@ -292,15 +301,8 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { // Restarting the owner of an already broken disk in a broken group must be allowed auto& [targetNodeId, targetPDiskId, unused1, unused2] = vdisks[0]; - NKikimrBlobStorage::TConfigRequest request; - - NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); - auto pdiskId = cmd->MutableTargetPDiskId(); - cmd->SetValue(true); - pdiskId->SetNodeId(targetNodeId); - pdiskId->SetPDiskId(targetPDiskId); - - auto response = env.Invoke(request); + auto response = SetReadOnly(env, targetNodeId, targetPDiskId, true); + UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription()); // Wait until pdisk restarts and node warden sends "pdisk restarted" to BSC. @@ -319,6 +321,8 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { }); UNIT_ASSERT(gotPdiskReport); + + CheckDiskIsReadOnly(env, {targetNodeId, targetPDiskId}); } Y_UNIT_TEST(SetGoodDiskInBrokenGroupReadOnlyNotAllowed) { @@ -352,17 +356,145 @@ Y_UNIT_TEST_SUITE(BSCReadOnlyPDisk) { } // However making the owner of a single good disk ReadOnly must be prohibited - NKikimrBlobStorage::TConfigRequest request; - - NKikimrBlobStorage::TSetPDiskReadOnly* cmd = request.AddCommand()->MutableSetPDiskReadOnly(); - auto pdiskId = cmd->MutableTargetPDiskId(); - cmd->SetValue(true); - pdiskId->SetNodeId(targetNodeId); - pdiskId->SetPDiskId(targetPDiskId); - - auto response = env.Invoke(request); + auto response = SetReadOnly(env, targetNodeId, targetPDiskId, true); UNIT_ASSERT_C(!response.GetSuccess(), "ReadOnly should've been prohibited"); UNIT_ASSERT_STRING_CONTAINS(response.GetErrorDescription(), "Disintegrated"); } + + Y_UNIT_TEST(ReadOnlySlay) { + TEnvironmentSetup env{{ + .NodeCount = 8, + .VDiskReplPausedAtStart = true, + .Erasure = TBlobStorageGroupType::Erasure4Plus2Block, + }}; + auto& runtime = env.Runtime; + + env.EnableDonorMode(); + env.CreateBoxAndPool(2, 1); + env.CommenceReplication(); + env.Sim(TDuration::Seconds(30)); + + const ui32 groupId = env.GetGroups().front(); + + const TActorId edge = runtime->AllocateEdgeActor(1, __FILE__, __LINE__); + for (ui32 i = 0; i < 100; ++i) { + const TString buffer = TStringBuilder() << "blob number " << i; + TLogoBlobID id(1, 1, 1, 0, buffer.size(), 0); + runtime->WrapInActorContext(edge, [&] { + SendToBSProxy(edge, groupId, new TEvBlobStorage::TEvPut(id, buffer, TInstant::Max())); + }); + auto res = env.WaitForEdgeActorEvent(edge, false); + UNIT_ASSERT_VALUES_EQUAL(res->Get()->Status, NKikimrProto::OK); + } + + // Wait for sync and stuff. + env.Sim(TDuration::Seconds(3)); + + // Move slot out the disk. + auto info = env.GetGroupInfo(groupId); + const TVDiskID& vdiskId = info->GetVDiskId(0); + const TActorId& vdiskActorId = info->GetActorId(0); + + ui32 targetNodeId, targetPDiskId; + std::tie(targetNodeId, targetPDiskId, std::ignore) = DecomposeVDiskServiceId(vdiskActorId); + + { + auto response = SetReadOnly(env, targetNodeId, targetPDiskId, true); + + UNIT_ASSERT_C(response.GetSuccess(), "ReadOnly should've been allowed"); + } + + env.SettlePDisk(vdiskActorId); + env.Sim(TDuration::Seconds(30)); + + // Find our donor and acceptor disks. + auto baseConfig = env.FetchBaseConfig(); + bool found = false; + std::pair donorPDiskId; + std::tuple acceptor; + std::tuple donorId; + for (const auto& slot : baseConfig.GetVSlot()) { + if (slot.DonorsSize()) { + UNIT_ASSERT(!found); + UNIT_ASSERT_VALUES_EQUAL(slot.DonorsSize(), 1); + const auto& donor = slot.GetDonors(0); + const auto& id = donor.GetVSlotId(); + UNIT_ASSERT_VALUES_EQUAL(vdiskActorId, MakeBlobStorageVDiskID(id.GetNodeId(), id.GetPDiskId(), id.GetVSlotId())); + UNIT_ASSERT_VALUES_EQUAL(VDiskIDFromVDiskID(donor.GetVDiskId()), vdiskId); + donorPDiskId = {id.GetNodeId(), id.GetPDiskId()}; + donorId = {id.GetNodeId(), id.GetPDiskId(), id.GetVSlotId()}; + const auto& acceptorId = slot.GetVSlotId(); + acceptor = {acceptorId.GetNodeId(), acceptorId.GetPDiskId(), acceptorId.GetVSlotId()}; + found = true; + } + } + UNIT_ASSERT(found); + + // Restart with formatting. + env.Cleanup(); + const size_t num = env.PDiskMockStates.erase(donorPDiskId); + UNIT_ASSERT_VALUES_EQUAL(num, 1); + env.Initialize(); + + // Wait for initialization. + env.Sim(TDuration::Seconds(30)); + + // Ensure donor finished its job. + baseConfig = env.FetchBaseConfig(); + found = false; + for (const auto& slot : baseConfig.GetVSlot()) { + const auto& id = slot.GetVSlotId(); + if (std::make_tuple(id.GetNodeId(), id.GetPDiskId(), id.GetVSlotId()) == acceptor) { + UNIT_ASSERT(!found); + UNIT_ASSERT_VALUES_EQUAL(slot.DonorsSize(), 0); + UNIT_ASSERT_VALUES_EQUAL(slot.GetStatus(), "REPLICATING"); + found = true; + } + } + UNIT_ASSERT(found); + + // Ensure donor was not slain yet. + TInstant barrier = env.Runtime->GetClock() + TDuration::Minutes(10); + env.Runtime->Sim([&] { return env.Runtime->GetClock() <= barrier; }, [&](IEventHandle &witnessedEvent) { + switch (witnessedEvent.GetTypeRewrite()) { + case TEvBlobStorage::TEvControllerNodeReport::EventType: { + UNIT_ASSERT(false); + break; + } + } + }); + + // Now make disk not read-only so that it can slay donor vdisk. + { + auto response = SetReadOnly(env, targetNodeId, targetPDiskId, false); + + UNIT_ASSERT_C(response.GetSuccess(), "ReadOnly should've been allowed"); + } + + // Ensure donor vdisk was slain. + barrier = env.Runtime->GetClock() + TDuration::Minutes(10); + bool gotNodeReport = false; + env.Runtime->Sim([&] { return env.Runtime->GetClock() <= barrier && (!gotNodeReport); }, [&](IEventHandle &witnessedEvent) { + switch (witnessedEvent.GetTypeRewrite()) { + case TEvBlobStorage::TEvControllerNodeReport::EventType: { + auto *nodeReport = witnessedEvent.Get(); + if (nodeReport) { + const auto& vdisks = nodeReport->Record.GetVDiskReports(); + if (vdisks.size() == 1) { + auto& vdisk = vdisks[0]; + const auto& vslotId = vdisk.GetVSlotId(); + std::tuple vdiskId = {vslotId.GetNodeId(), vslotId.GetPDiskId(), vslotId.GetVSlotId()}; + UNIT_ASSERT_VALUES_EQUAL(donorId, vdiskId); + UNIT_ASSERT_EQUAL(vdisk.GetPhase(), NKikimrBlobStorage::TEvControllerNodeReport::DESTROYED); + gotNodeReport = true; + } + } + break; + } + } + }); + + UNIT_ASSERT(gotNodeReport); + } } diff --git a/ydb/core/mind/bscontroller/config.cpp b/ydb/core/mind/bscontroller/config.cpp index bc06dcaa5854..faca35b91ded 100644 --- a/ydb/core/mind/bscontroller/config.cpp +++ b/ydb/core/mind/bscontroller/config.cpp @@ -993,6 +993,7 @@ namespace NKikimr::NBsController { pb->MutablePDiskMetrics()->ClearPDiskId(); pb->SetExpectedSerial(pdisk.ExpectedSerial); pb->SetLastSeenSerial(pdisk.LastSeenSerial); + pb->SetReadOnly(pdisk.Mood == TPDiskMood::ReadOnly); } void TBlobStorageController::Serialize(NKikimrBlobStorage::TVSlotId *pb, TVSlotId id) { diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto index f8b4aa98ec5f..66bc5363a260 100644 --- a/ydb/core/protos/blobstorage_config.proto +++ b/ydb/core/protos/blobstorage_config.proto @@ -630,6 +630,7 @@ message TBaseConfig { EDecommitStatus DecommitStatus = 16; string ExpectedSerial = 17; string LastSeenSerial = 18; + bool ReadOnly = 19; } message TVSlot { message TDonorDisk {