Skip to content

Commit

Permalink
Add PURGE_HOST cms action (#2348)
Browse files Browse the repository at this point in the history
* Add PURGE_HOST cms action

* Review fixes

* Review fixes 2
  • Loading branch information
komarevtsev-d authored Nov 1, 2024
1 parent d5b6ea7 commit b29fd3a
Show file tree
Hide file tree
Showing 29 changed files with 974 additions and 191 deletions.
9 changes: 8 additions & 1 deletion cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,7 @@ message TStorageServiceConfig

// CMS actions such as "REMOVE_HOST" and "REMOVE_DEVICE" will attempt to
// remove devices and the host from DR memory (including persistent storage).
optional bool CleanupDRConfigOnCMSActions = 375;
// optional bool CleanupDRConfigOnCMSActions = 375; // obsolete

// Resync range, if scrubbing found a mismatch.
optional bool ResyncRangeAfterScrubbing = 376;
Expand Down Expand Up @@ -1046,4 +1046,11 @@ message TStorageServiceConfig

// Timeout between disks reallocations.
optional uint32 DiskRegistryDisksNotificationTimeout = 387;

// Whether the disk registry is allowed to allocate local disks while the
// agent is in the warning state.
// When enabled, the Disk Registry REMOVE_HOST CMS action will not "forget"
// the agent's devices and will not suspend local devices. Instead, PURGE_HOST
// should be used for these purposes.
optional bool DiskRegistryAlwaysAllocatesLocalDisks = 388;
}
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ namespace NCloud::NBlockStore {
xxx(BlockDigestMismatchInBlob) \
xxx(DiskRegistryResumeDeviceFailed) \
xxx(DiskRegistryAgentDevicePoolConfigMismatch) \
xxx(DiskRegistryPurgeHostError) \
xxx(DiskRegistryCleanupAgentConfigError) \
// BLOCKSTORE_CRITICAL_EVENTS

#define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ TDuration MSeconds(ui32 value)
xxx(MaxLocalVolumes, ui32, 100 )\
\
xxx(DiskRegistryVolumeConfigUpdatePeriod, TDuration, Minutes(5) )\
xxx(CleanupDRConfigOnCMSActions, bool, false )\
xxx(DiskRegistryAlwaysAllocatesLocalDisks, bool, false )\
\
xxx(ReassignRequestRetryTimeout, TDuration, Seconds(5) )\
xxx(ReassignChannelsPercentageThreshold, ui32, 10 )\
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ class TStorageConfig
ui32 GetMaxLocalVolumes() const;

TDuration GetDiskRegistryVolumeConfigUpdatePeriod() const;
bool GetCleanupDRConfigOnCMSActions() const;
bool GetDiskRegistryAlwaysAllocatesLocalDisks() const;
TDuration GetReassignRequestRetryTimeout() const;
ui32 GetReassignChannelsPercentageThreshold() const;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ class TCmsRequestActor final
const TEvDiskRegistryPrivate::TEvUpdateCmsHostStateResponse::TPtr& ev,
const TActorContext& ctx);

void HandlePurgeHostCmsResponse(
const TEvDiskRegistryPrivate::TEvPurgeHostCmsResponse::TPtr& ev,
const TActorContext& ctx);

void HandleGetDependentDisksResponse(
const TEvDiskRegistry::TEvGetDependentDisksResponse::TPtr& ev,
const TActorContext& ctx);
Expand Down Expand Up @@ -172,6 +176,16 @@ void TCmsRequestActor::SendNextRequest(const TActorContext& ctx)
break;
}

case NProto::TAction_EType::TAction_EType_PURGE_HOST: {
using TRequest = TEvDiskRegistryPrivate::TEvPurgeHostCmsRequest;
auto request = std::make_unique<TRequest>(
action.GetHost(),
action.GetDryRun());

NCloud::Send(ctx, Owner, std::move(request));
break;
}

default: {
auto& result = *Response->Record.MutableActionResults()->Add();
*result.MutableResult() = MakeError(
Expand Down Expand Up @@ -212,6 +226,13 @@ void TCmsRequestActor::HandleCmsActionResponse(
SendNextRequest(ctx);
}

// Handles TEvPurgeHostCmsResponse by passing the event payload to
// HandleCmsActionResponse.
void TCmsRequestActor::HandlePurgeHostCmsResponse(
    const TEvDiskRegistryPrivate::TEvPurgeHostCmsResponse::TPtr& ev,
    const TActorContext& ctx)
{
    auto* msg = ev->Get();
    HandleCmsActionResponse(*msg, ctx);
}

void TCmsRequestActor::HandleCmsActionResponseProto(
const TEvDiskRegistry::TEvGetDependentDisksResponse& response,
const TActorContext& ctx)
Expand Down Expand Up @@ -272,6 +293,10 @@ STFUNC(TCmsRequestActor::StateWork)
TEvDiskRegistryPrivate::TEvUpdateCmsHostStateResponse,
HandleUpdateCmsHostStateResponse);

HFunc(
TEvDiskRegistryPrivate::TEvPurgeHostCmsResponse,
HandlePurgeHostCmsResponse);

HFunc(
TEvDiskRegistry::TEvGetDependentDisksResponse,
HandleGetDependentDisksResponse);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#include "disk_registry_actor.h"
#include "disk_registry_database.h"

namespace NCloud::NBlockStore::NStorage {

using namespace NActors;
using namespace NKikimr;
using namespace NKikimr::NTabletFlatExecutor;

////////////////////////////////////////////////////////////////////////////////

// Entry point for TEvPurgeHostCmsRequest: invokes the
// BLOCKSTORE_DISK_REGISTRY_COUNTER macro for PurgeHostCms, logs the requested
// host and starts the TPurgeHostCms transaction.
void TDiskRegistryActor::HandlePurgeHostCms(
    const TEvDiskRegistryPrivate::TEvPurgeHostCmsRequest::TPtr& ev,
    const TActorContext& ctx)
{
    BLOCKSTORE_DISK_REGISTRY_COUNTER(PurgeHostCms);

    auto* request = ev->Get();

    // Built from the sender, cookie and call context of the incoming event;
    // the Complete stage replies through it.
    auto info = CreateRequestInfo(ev->Sender, ev->Cookie, request->CallContext);

    LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY,
        "[%lu] Received PurgeHostCms request: Host=%s",
        TabletID(),
        request->Host.c_str());

    // Host is moved out of the event only here, after it has been logged.
    ExecuteTx<TPurgeHostCms>(
        ctx,
        std::move(info),
        std::move(request->Host),
        request->DryRun);
}

////////////////////////////////////////////////////////////////////////////////

// Prepare stage of the TPurgeHostCms transaction. None of the arguments are
// used; the function always returns true.
bool TDiskRegistryActor::PreparePurgeHostCms(
    const TActorContext& /*ctx*/,
    TTransactionContext& /*tx*/,
    TTxDiskRegistry::TPurgeHostCms& /*args*/)
{
    return true;
}

void TDiskRegistryActor::ExecutePurgeHostCms(
const TActorContext& ctx,
TTransactionContext& tx,
TTxDiskRegistry::TPurgeHostCms& args)
{
TDiskRegistryDatabase db(tx.DB);
args.Error = State->PurgeHost(
db,
args.Host,
ctx.Now(),
args.DryRun,
args.AffectedDisks);
}

void TDiskRegistryActor::CompletePurgeHostCms(
const TActorContext& ctx,
TTxDiskRegistry::TPurgeHostCms& args)
{
LOG_INFO(
ctx,
TBlockStoreComponents::DISK_REGISTRY,
"PurgeHostCms result: Host=%s Error=%s AffectedDisks=%s",
args.Host.c_str(),
FormatError(args.Error).c_str(),
[&]
{
TStringStream out;
out << "[";
for (const auto& diskId: args.AffectedDisks) {
out << " " << diskId << ":"
<< NProto::EDiskState_Name(State->GetDiskState(diskId));
}
out << " ]";
return out.Str();
}().c_str());

ReallocateDisks(ctx);
NotifyUsers(ctx);
PublishDiskStates(ctx);

SecureErase(ctx);
StartMigration(ctx);

auto response =
std::make_unique<TEvDiskRegistryPrivate::TEvPurgeHostCmsResponse>(
std::move(args.Error),
TDuration(), // Timeout
std::move(args.AffectedDisks));

NCloud::Reply(ctx, *args.RequestInfo, std::move(response));
}

} // namespace NCloud::NBlockStore::NStorage
Original file line number Diff line number Diff line change
Expand Up @@ -300,54 +300,14 @@ void TDiskRegistryActor::CompleteCleanupDevices(
void TDiskRegistryActor::SecureErase(const TActorContext& ctx)
{
auto dirtyDevices = State->GetDirtyDevices();
EraseIf(dirtyDevices, [&] (auto& d) {
if (d.GetState() == NProto::DEVICE_STATE_ERROR) {
LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Skip SecureErase for device '%s'. Device in error state",
TabletID(),
d.GetDeviceUUID().c_str());

return true;
}

if (State->IsAutomaticallyReplaced(d.GetDeviceUUID())) {
LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Skip SecureErase for device '%s'."
" Device was automatically replaced recently.",
TabletID(),
d.GetDeviceUUID().c_str(),
d.GetNodeId());

return true;
}

auto* agent = State->FindAgent(d.GetNodeId());
if (!agent) {
LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Skip SecureErase for device '%s'."
" Agent for node id %d not found",
TabletID(),
d.GetDeviceUUID().c_str(),
d.GetNodeId());

return true;
}

if (agent->GetState() == NProto::AGENT_STATE_UNAVAILABLE) {
LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Skip SecureErase for device '%s'."
" Agent is unavailable",
TabletID(),
d.GetDeviceUUID().c_str());

return true;
}

return false;
});
EraseIf(
dirtyDevices,
[this](auto& device) { return !State->CanSecureErase(device); });

if (!dirtyDevices) {
LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY,
LOG_DEBUG(
ctx,
TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Nothing to erase",
TabletID());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void TDiskRegistryActor::HandleUpdateCmsHostState(
const TEvDiskRegistryPrivate::TEvUpdateCmsHostStateRequest::TPtr& ev,
const TActorContext& ctx)
{
BLOCKSTORE_DISK_REGISTRY_COUNTER(UpdateCmsHostDeviceState);
BLOCKSTORE_DISK_REGISTRY_COUNTER(UpdateCmsHostState);

auto* msg = ev->Get();

Expand Down Expand Up @@ -55,9 +55,6 @@ void TDiskRegistryActor::ExecuteUpdateCmsHostState(
TTransactionContext& tx,
TTxDiskRegistry::TUpdateCmsHostState& args)
{
Y_UNUSED(ctx);
Y_UNUSED(args);

TDiskRegistryDatabase db(tx.DB);

args.TxTs = ctx.Now();
Expand Down Expand Up @@ -94,7 +91,7 @@ void TDiskRegistryActor::CompleteUpdateCmsHostState(
out << " " << diskId
<< ":" << NProto::EDiskState_Name(State->GetDiskState(diskId));
}
out << "]";
out << " ]";
return out.Str();
}().c_str());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ using TVolumeConfig = NKikimrBlockStore::TVolumeConfig;
xxx(FinishVolumeConfigUpdate, __VA_ARGS__) \
xxx(RestoreDiskRegistryPart, __VA_ARGS__) \
xxx(SwitchAgentDisksToReadOnly, __VA_ARGS__) \
xxx(PurgeHostCms, __VA_ARGS__) \
// BLOCKSTORE_DISK_REGISTRY_REQUESTS_PRIVATE

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -563,8 +564,26 @@ struct TEvDiskRegistryPrivate
{}
};

//
// PurgeHostCms
//

struct TPurgeHostCmsRequest
{
TString Host;
bool DryRun;

TPurgeHostCmsRequest(
TString host,
bool dryRun)
: Host(std::move(host))
, DryRun(dryRun)
{}
};

// The CMS responses below all use TCmsActionResponse as their payload type.
using TUpdateCmsHostDeviceStateResponse = TCmsActionResponse;
using TUpdateCmsHostStateResponse = TCmsActionResponse;
using TPurgeHostCmsResponse = TCmsActionResponse;

//
// StartMigration
Expand Down
Loading

0 comments on commit b29fd3a

Please sign in to comment.