Skip to content

Commit

Permalink
issue-1444: change GetCpuWait return type to TResultOrError (#2485)
Browse files: browse the repository at this point in the history
  • Loading branch information
antonmyagkov authored Nov 18, 2024
1 parent fb77d86 commit b3c6ed5
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,15 @@ void TVolumeBalancerActor::HandleGetVolumeStatsResponse(
auto now = ctx.Now();

auto interval = (now - LastCpuWaitQuery).MicroSeconds();
auto cpuLack = CpuLackPercentsMultiplier * CgroupStatsFetcher->GetCpuWait().MicroSeconds();
auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait();
if (HasError(error)) {
LOG_ERROR_S(
ctx,
TBlockStoreComponents::VOLUME_BALANCER,
"Failed to get CpuWait stats: " << error);
}
auto cpuLack =
CpuLackPercentsMultiplier * cpuWait.MicroSeconds();
cpuLack /= interval;
*CpuWait = cpuLack;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ struct TCgroupStatsFetcherMock: public NCloud::NStorage::ICgroupStatsFetcher
{
}

TDuration GetCpuWait() override
TResultOrError<TDuration> GetCpuWait() override
{
return Value;
};
Expand Down
39 changes: 24 additions & 15 deletions cloud/filestore/libs/storage/service/service_actor_update_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,27 +33,36 @@ void TStorageServiceActor::HandleUpdateStats(
}
}
if (CgroupStatsFetcher && CpuWait) {

auto now = ctx.Now();

auto interval = (now - LastCpuWaitQuery).MicroSeconds();
auto cpuWaitValue = CgroupStatsFetcher->GetCpuWait().MicroSeconds();
auto cpuLack = CpuLackPercentsMultiplier * cpuWaitValue / interval;

LOG_DEBUG_S(
ctx,
TFileStoreComponents::SERVICE,
"CpuWait stats: lack = " << cpuLack << "; interval = " << interval
<< "; wait = " << cpuWaitValue);

*CpuWait = cpuLack;
LastCpuWaitQuery = now;
if (auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait();
!HasError(error))
{
auto cpuWaitValue = cpuWait.MicroSeconds();
auto cpuLack = CpuLackPercentsMultiplier * cpuWaitValue / interval;

if (cpuLack >= StorageConfig->GetCpuLackThreshold()) {
LOG_WARN_S(
LOG_DEBUG_S(
ctx,
TFileStoreComponents::SERVICE,
"CpuWait stats: lack = " << cpuLack
<< "; interval = " << interval
<< "; wait = " << cpuWaitValue);

*CpuWait = cpuLack;
LastCpuWaitQuery = now;

if (cpuLack >= StorageConfig->GetCpuLackThreshold()) {
LOG_WARN_S(
ctx,
TFileStoreComponents::SERVICE,
"Cpu wait is " << cpuLack);
}
} else {
LOG_ERROR_S(
ctx,
TFileStoreComponents::SERVICE,
"Cpu wait is " << cpuLack);
"Failed to get CpuWait stats: " << error);
}
}

Expand Down
31 changes: 16 additions & 15 deletions cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,21 @@ struct TCgroupStatsFetcher final
return;
}

Last = GetCpuWait();
if (auto [cpuWait, error] = GetCpuWait(); HasError(error)) {
STORAGE_ERROR("Failed to get CpuWait stats: " << error);
} else {
Last = cpuWait;
}
}

void Stop() override
{
}

TDuration GetCpuWait() override
TResultOrError<TDuration> GetCpuWait() override
{
if (!CpuAcctWait.IsOpen()) {
return {};
return MakeError(E_INVALID_STATE, "Failed to open " + StatsFile);
}

try {
Expand All @@ -95,9 +99,8 @@ struct TCgroupStatsFetcher final

if (CpuAcctWait.GetLength() >= bufSize - 1) {
ReportCpuWaitFatalError();
STORAGE_ERROR(StatsFile << " is too large");
CpuAcctWait.Close();
return {};
return MakeError(E_INVALID_STATE, StatsFile + " is too large");
}

char buf[bufSize];
Expand All @@ -110,23 +113,21 @@ struct TCgroupStatsFetcher final
auto value = TDuration::MicroSeconds(FromString<ui64>(buf) / 1000);

if (value < Last) {
STORAGE_ERROR(
ReportCpuWaitCounterReadError(
TStringBuilder() << StatsFile <<
" : new value " << value <<
" is less than previous " << Last));
auto errorMessage = ReportCpuWaitCounterReadError(
TStringBuilder() << StatsFile << " : new value " << value
<< " is less than previous " << Last);
Last = value;
return {};
return MakeError(E_INVALID_STATE, std::move(errorMessage));
}
auto retval = value - Last;
Last = value;

return retval;
} catch (...) {
ReportCpuWaitFatalError();
STORAGE_ERROR(BuildErrorMessageFromException())
auto errorMessage = BuildErrorMessageFromException();
CpuAcctWait.Close();
return {};
return MakeError(E_FAIL, std::move(errorMessage));
}
}

Expand Down Expand Up @@ -169,9 +170,9 @@ struct TCgroupStatsFetcherStub final
{
}

TDuration GetCpuWait() override
TResultOrError<TDuration> GetCpuWait() override
{
return {};
return TDuration::Zero();
}
};

Expand Down
3 changes: 2 additions & 1 deletion cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "public.h"

#include <cloud/storage/core/libs/common/error.h>
#include <cloud/storage/core/libs/common/startable.h>

#include <util/datetime/base.h>
Expand All @@ -21,7 +22,7 @@ struct ICgroupStatsFetcher
{
virtual ~ICgroupStatsFetcher() = default;

virtual TDuration GetCpuWait() = 0;
virtual TResultOrError<TDuration> GetCpuWait() = 0;
};

using ICgroupStatsFetcherPtr = std::shared_ptr<ICgroupStatsFetcher>;
Expand Down
45 changes: 36 additions & 9 deletions cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,26 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest)
});
fetcher->Start();

UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait());
auto cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError());
UNIT_ASSERT_VALUES_EQUAL(
TDuration::MicroSeconds(0),
cpuWait.GetResult());

UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(20));
UNIT_ASSERT_VALUES_EQUAL(TDuration::MicroSeconds(10), fetcher->GetCpuWait());
cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError());
UNIT_ASSERT_VALUES_EQUAL(
TDuration::MicroSeconds(10),
cpuWait.GetResult());

fetcher->Stop();

UNIT_ASSERT_VALUES_EQUAL(0, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val());
UNIT_ASSERT_VALUES_EQUAL(
0,
serverGroup
->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)
->Val());
}

Y_UNIT_TEST(ShouldReportErrorIfFileIsMissing)
Expand All @@ -100,8 +112,10 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest)

UNIT_ASSERT_VALUES_EQUAL(1, failCounter->Val());

UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait());
UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait());
auto cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError());
cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError());

fetcher->Stop();
}
Expand All @@ -127,12 +141,25 @@ Y_UNIT_TEST_SUITE(TCGroupStatFetcherTest)
fetcher->Start();

UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(80));
UNIT_ASSERT_VALUES_EQUAL(TDuration(), fetcher->GetCpuWait());
UNIT_ASSERT_VALUES_EQUAL(1, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val());
auto cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(HasError(cpuWait), cpuWait.GetError());
UNIT_ASSERT_VALUES_EQUAL(
1,
serverGroup
->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)
->Val());

UpdateCGroupWaitDuration(statsFile, TDuration::MicroSeconds(100));
UNIT_ASSERT_VALUES_EQUAL(TDuration::MicroSeconds(20), fetcher->GetCpuWait());
UNIT_ASSERT_VALUES_EQUAL(1, serverGroup->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)->Val());
cpuWait = fetcher->GetCpuWait();
UNIT_ASSERT_C(!HasError(cpuWait), cpuWait.GetError());
UNIT_ASSERT_VALUES_EQUAL(
TDuration::MicroSeconds(20),
cpuWait.GetResult());
UNIT_ASSERT_VALUES_EQUAL(
1,
serverGroup
->GetCounter("AppCriticalEvents/CpuWaitCounterReadError", true)
->Val());

fetcher->Stop();
}
Expand Down
12 changes: 8 additions & 4 deletions cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,14 @@ int main(int argc, const char** argv)
while (!options.SampleCount || numSamples--) {
Sleep(pollInterval);

auto waitTime = 100 * statsFetcher->GetCpuWait().MicroSeconds();
auto interval = pollInterval.MicroSeconds();

Cout << (waitTime / interval) << Endl;
auto cpuWait = statsFetcher->GetCpuWait();
if (!HasError(cpuWait)) {
auto waitTime = 100 * cpuWait.GetResult().MicroSeconds();
auto interval = pollInterval.MicroSeconds();
Cout << (waitTime / interval) << Endl;
} else {
Cout << cpuWait.GetError() << Endl;
}
}

return 0;
Expand Down

0 comments on commit b3c6ed5

Please sign in to comment.