From b3c6ed541bab76ab4a5e828edeb29a5135647236 Mon Sep 17 00:00:00 2001 From: Anton Myagkov Date: Mon, 18 Nov 2024 12:01:55 +0100 Subject: [PATCH] issue-1444: change GetCpuWait return type to TResultOrError (#2485) --- .../volume_balancer/volume_balancer_actor.cpp | 10 ++++- .../volume_balancer/volume_balancer_ut.cpp | 2 +- .../service/service_actor_update_stats.cpp | 39 +++++++++------- .../libs/diagnostics/cgroup_stats_fetcher.cpp | 31 ++++++------- .../libs/diagnostics/cgroup_stats_fetcher.h | 3 +- .../diagnostics/cgroup_stats_fetcher_ut.cpp | 45 +++++++++++++++---- .../tools/analytics/cpu-wait-monitor/main.cpp | 12 +++-- 7 files changed, 96 insertions(+), 46 deletions(-) diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp index 95d380f66ae..19d8a97da82 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp @@ -245,7 +245,15 @@ void TVolumeBalancerActor::HandleGetVolumeStatsResponse( auto now = ctx.Now(); auto interval = (now - LastCpuWaitQuery).MicroSeconds(); - auto cpuLack = CpuLackPercentsMultiplier * CgroupStatsFetcher->GetCpuWait().MicroSeconds(); + auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait(); + if (HasError(error)) { + LOG_ERROR_S( + ctx, + TBlockStoreComponents::VOLUME_BALANCER, + "Failed to get CpuWait stats: " << error); + } + auto cpuLack = + CpuLackPercentsMultiplier * cpuWait.MicroSeconds(); cpuLack /= interval; *CpuWait = cpuLack; diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp index 3928bf6d6ee..4837d25424f 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp @@ -211,7 +211,7 @@ struct TCgroupStatsFetcherMock: public NCloud::NStorage::ICgroupStatsFetcher { } - TDuration GetCpuWait() override + TResultOrError GetCpuWait() override { return Value; }; diff --git a/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp b/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp index 9a1ad875e55..f0d7901d849 100644 --- a/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp +++ b/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp @@ -33,27 +33,36 @@ void TStorageServiceActor::HandleUpdateStats( } } if (CgroupStatsFetcher && CpuWait) { - auto now = ctx.Now(); auto interval = (now - LastCpuWaitQuery).MicroSeconds(); - auto cpuWaitValue = CgroupStatsFetcher->GetCpuWait().MicroSeconds(); - auto cpuLack = CpuLackPercentsMultiplier * cpuWaitValue / interval; - - LOG_DEBUG_S( - ctx, - TFileStoreComponents::SERVICE, - "CpuWait stats: lack = " << cpuLack << "; interval = " << interval - << "; wait = " << cpuWaitValue); - - *CpuWait = cpuLack; - LastCpuWaitQuery = now; + if (auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait(); + !HasError(error)) + { + auto cpuWaitValue = cpuWait.MicroSeconds(); + auto cpuLack = CpuLackPercentsMultiplier * cpuWaitValue / interval; - if (cpuLack >= StorageConfig->GetCpuLackThreshold()) { - LOG_WARN_S( + LOG_DEBUG_S( + ctx, + TFileStoreComponents::SERVICE, + "CpuWait stats: lack = " << cpuLack + << "; interval = " << interval + << "; wait = " << cpuWaitValue); + + *CpuWait = cpuLack; + LastCpuWaitQuery = now; + + if (cpuLack >= StorageConfig->GetCpuLackThreshold()) { + LOG_WARN_S( + ctx, + TFileStoreComponents::SERVICE, + "Cpu wait is " << cpuLack); + } + } else { + LOG_ERROR_S( ctx, TFileStoreComponents::SERVICE, - "Cpu wait is " << cpuLack); + "Failed to get CpuWait stats: " << error); } } diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp index 505b2ea46a5..bb253450ad5 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp +++ b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp @@ -75,17 +75,21 @@ struct TCgroupStatsFetcher final return; } - Last = GetCpuWait(); + if (auto [cpuWait, error] = GetCpuWait(); HasError(error)) { + STORAGE_ERROR("Failed to get CpuWait stats: " << error); + } else { + Last = cpuWait; + } } void Stop() override { } - TDuration GetCpuWait() override + TResultOrError GetCpuWait() override { if (!CpuAcctWait.IsOpen()) { - return {}; + return MakeError(E_INVALID_STATE, "Failed to open " + StatsFile); } try { @@ -95,9 +99,8 @@ struct TCgroupStatsFetcher final if (CpuAcctWait.GetLength() >= bufSize - 1) { ReportCpuWaitFatalError(); - STORAGE_ERROR(StatsFile << " is too large"); CpuAcctWait.Close(); - return {}; + return MakeError(E_INVALID_STATE, StatsFile + " is too large"); } char buf[bufSize]; @@ -110,13 +113,11 @@ struct TCgroupStatsFetcher final auto value = TDuration::MicroSeconds(FromString(buf) / 1000); if (value < Last) { - STORAGE_ERROR( - ReportCpuWaitCounterReadError( - TStringBuilder() << StatsFile << - " : new value " << value << - " is less than previous " << Last)); + auto errorMessage = ReportCpuWaitCounterReadError( + TStringBuilder() << StatsFile << " : new value " << value + << " is less than previous " << Last); Last = value; - return {}; + return MakeError(E_INVALID_STATE, std::move(errorMessage)); } auto retval = value - Last; Last = value; @@ -124,9 +125,9 @@ struct TCgroupStatsFetcher final return retval; } catch (...) { ReportCpuWaitFatalError(); - STORAGE_ERROR(BuildErrorMessageFromException()) + auto errorMessage = BuildErrorMessageFromException(); CpuAcctWait.Close(); - return {}; + return MakeError(E_FAIL, std::move(errorMessage)); } } @@ -169,9 +170,9 @@ struct TCgroupStatsFetcherStub final { } - TDuration GetCpuWait() override + TResultOrError GetCpuWait() override { - return {}; + return TDuration::Zero(); } }; diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h index 9cd559ef1a1..a4b30e03c74 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h +++ b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h @@ -2,6 +2,7 @@ #include "public.h" +#include #include #include @@ -21,7 +22,7 @@ struct ICgroupStatsFetcher { virtual ~ICgroupStatsFetcher() = default; - virtual TDuration GetCpuWait() = 0; + virtual TResultOrError GetCpuWait() = 0; }; using ICgroupStatsFetcherPtr = std::shared_ptr; diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp index 31f6861a0ac..635aa3d4af7 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp +++ b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp @@ -66,14 +66,26 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest) }); fetcher->Start(); - UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait()); + auto cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError()); + UNIT_ASSERT_VALUES_EQUAL( + TDuration::MicroSeconds(0), + cpuWait.GetResult()); UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(20)); - UNIT_ASSERT_VALUES_EQUAL(TDuration::MicroSeconds(10), fetcher->GetCpuWait()); + cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError()); + UNIT_ASSERT_VALUES_EQUAL( + TDuration::MicroSeconds(10), + cpuWait.GetResult()); fetcher->Stop(); - UNIT_ASSERT_VALUES_EQUAL(0, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val()); + UNIT_ASSERT_VALUES_EQUAL( + 0, + serverGroup + ->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true) + ->Val()); } Y_UNIT_TEST(ShouldReportErrorIfFileIsMissing) @@ -100,8 +112,10 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest) UNIT_ASSERT_VALUES_EQUAL(1, failCounter->Val()); - UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait()); - UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait()); + auto cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError()); + cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError()); fetcher->Stop(); } @@ -127,12 +141,25 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest) fetcher->Start(); UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(80)); - UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait()); - UNIT_ASSERT_VALUES_EQUAL(1, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val()); + auto cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError()); + UNIT_ASSERT_VALUES_EQUAL( + 1, + serverGroup + ->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true) + ->Val()); UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(100)); - UNIT_ASSERT_VALUES_EQUAL(TDuration::MicroSeconds(20), fetcher->GetCpuWait()); - UNIT_ASSERT_VALUES_EQUAL(1, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val()); + cpuWait = fetcher->GetCpuWait(); + UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError()); + UNIT_ASSERT_VALUES_EQUAL( + TDuration::MicroSeconds(20), + cpuWait.GetResult()); + UNIT_ASSERT_VALUES_EQUAL( + 1, + serverGroup + ->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true) + ->Val()); fetcher->Stop(); } diff --git a/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp b/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp index f248e10348c..d0accaa737a 100644 --- a/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp +++ b/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp @@ -80,10 +80,14 @@ int main(int argc, const char** argv) while (!options.SampleCount || numSamples--) { Sleep(pollInterval); - auto waitTime = 100 * statsFetcher->GetCpuWait().MicroSeconds(); - auto interval = pollInterval.MicroSeconds(); - - Cout << (waitTime / interval) << Endl; + auto cpuWait = statsFetcher->GetCpuWait(); + if (!HasError(cpuWait)) { + auto waitTime = 100 * cpuWait.GetResult().MicroSeconds(); + auto interval = pollInterval.MicroSeconds(); + Cout << (waitTime / interval) << Endl; + } else { + Cout << cpuWait.GetError() << Endl; + } } return 0;