From bbf13511e0d4c8e4b21046b5985aba695b857f81 Mon Sep 17 00:00:00 2001 From: Stephanie Han Date: Thu, 19 Sep 2024 10:32:31 -0700 Subject: [PATCH] Add getFileRawDataSize Test Utilities (#73) Summary: Pull Request resolved: https://github.com/facebookincubator/nimble/pull/73 # Changes Add functions to the `TestUtils.h` headers in the tests folders of both `encodings` and `velox` that let us calculate the raw data size of a file by reading it back. - Note the special handling for Nullable encoding types and String data types in the `getRawDataSize` function. # Context This is necessary to check the correctness of the raw data size stat added in the following diff, which calculates the size when writing. This change will also be used by nimble_dump later on (to be implemented in a later diff). Reviewed By: sdruzkin, helfman Differential Revision: D61050482 --- dwio/nimble/encodings/tests/CMakeLists.txt | 4 + .../tests/EncodingSelectionTests.cpp | 76 +++++++++- dwio/nimble/encodings/tests/TestUtils.cpp | 138 ++++++++++++++++++ dwio/nimble/encodings/tests/TestUtils.h | 8 +- dwio/nimble/velox/tests/TestUtils.h | 52 +++++++ 5 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 dwio/nimble/encodings/tests/TestUtils.cpp create mode 100644 dwio/nimble/velox/tests/TestUtils.h diff --git a/dwio/nimble/encodings/tests/CMakeLists.txt b/dwio/nimble/encodings/tests/CMakeLists.txt index 0751320..ed23d61 100644 --- a/dwio/nimble/encodings/tests/CMakeLists.txt +++ b/dwio/nimble/encodings/tests/CMakeLists.txt @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
+add_library(nimble_encodings_tests_utils TestUtils.cpp) +target_link_libraries(nimble_encodings_tests_utils nimble_encodings) + add_executable( nimble_encodings_tests ConstantEncodingTests.cpp @@ -27,6 +30,7 @@ add_test(nimble_encodings_tests nimble_encodings_tests) target_link_libraries( nimble_encodings_tests + nimble_encodings_tests_utils nimble_encodings nimble_common nimble_tools_common diff --git a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp index 110c9dd..5c5aa94 100644 --- a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp +++ b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp @@ -21,6 +21,7 @@ #include "dwio/nimble/encodings/EncodingFactory.h" #include "dwio/nimble/encodings/EncodingSelectionPolicy.h" #include "dwio/nimble/encodings/NullableEncoding.h" +#include "dwio/nimble/encodings/tests/TestUtils.h" #include "dwio/nimble/tools/EncodingUtilities.h" using namespace ::facebook; @@ -112,6 +113,12 @@ void test(std::span values, std::vector expected) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size_bytes(); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); ASSERT_GT(expected.size(), 0); @@ -554,6 +561,12 @@ TEST(EncodingSelectionBoolTests, SelectTrivial) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size() * sizeof(T); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -599,6 +612,12 @@ TEST(EncodingSelectionBoolTests, SelectRunLength) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // 
test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size() * sizeof(T); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -635,6 +654,12 @@ TEST(EncodingSelectionStringTests, SelectConst) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = value.size() * values.size(); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -659,10 +684,14 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) { std::string(5000, '\0'), }) { std::vector values; - values.resize(1000); + auto expectedSize = 0; + + auto resize = 1000; + values.resize(resize); for (auto i = 0; i < values.size(); ++i) { values[i] = value; } + expectedSize += resize * value.size(); std::vector uncommonValues; for (auto i = 0; i < values.size() / 20; ++i) { @@ -670,13 +699,20 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) { } for (auto i = 0; i < uncommonValues.size(); ++i) { - values[i * 20] = uncommonValues[i]; + std::string_view val = uncommonValues[i]; + values[i * 20] = val; + expectedSize += val.size() - value.size(); } auto policy = getRootManualSelectionPolicy(); auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -730,14 +766,21 @@ TEST(EncodingSelectionStringTests, SelectTrivial) { } std::vector values; + auto expectedSize = 0; values.resize(cache.size()); for (auto i = 0; i < cache.size(); ++i) { values[i] = cache[i]; + expectedSize += cache[i].size(); } auto serialized = 
nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -769,14 +812,22 @@ TEST(EncodingSelectionStringTests, SelectDictionary) { auto policy = getRootManualSelectionPolicy(); std::vector values; + auto expectedSize = 0; values.resize(10000); for (auto i = 0; i < values.size(); ++i) { - values[i] = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()]; + T val = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()]; + values[i] = val; + expectedSize += val.size(); } auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -821,12 +872,15 @@ TEST(EncodingSelectionStringTests, SelectRunLength) { } std::vector values; + auto expectedSize = 0; values.reserve(valueCount); auto index = 0; for (const auto length : runLengths) { for (auto i = 0; i < length; ++i) { - values.emplace_back( - index % 2 == 0 ? "abcdefghijklmnopqrstuvwxyz" : "1234567890"); + std::string_view val = + ((index % 2 == 0) ? 
"abcdefghijklmnopqrstuvwxyz" : "1234567890"); + values.emplace_back(val); + expectedSize += val.size(); } ++index; } @@ -835,6 +889,11 @@ TEST(EncodingSelectionStringTests, SelectRunLength) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -878,5 +937,12 @@ TEST(EncodingSelectionTests, TestNullable) { auto serialized = nimble::EncodingFactory::encodeNullable( std::move(policy), data, nulls, buffer); + + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = 15 + 6; // 15 bytes for string data, 6 bytes for nulls + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); } diff --git a/dwio/nimble/encodings/tests/TestUtils.cpp b/dwio/nimble/encodings/tests/TestUtils.cpp new file mode 100644 index 0000000..e84da9f --- /dev/null +++ b/dwio/nimble/encodings/tests/TestUtils.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+
+#include <cstdint>
+#include <numeric>
+#include <string_view>
+#include <vector>
+
+#include "dwio/nimble/encodings/EncodingUtils.h"
+
+namespace facebook::nimble::test {
+
+// Byte offsets into a serialized encoding, as relied on by getRawDataSize().
+// NOTE(review): these are hand-maintained copies of the serialized layout;
+// confirm them against the encoding format definition if it ever changes.
+//
+// Offset, from the start of an encoding, at which its row count is peeked.
+static constexpr int kRowCountOffset = 2;
+// Number of leading bytes skipped before reading an encoding's payload.
+static constexpr int kPrefixSize = 6;
+// Number of bytes skipped at the start of a Trivial payload (presumably the
+// compression type field).
+static constexpr int kCompressionTypeSize = 1;
+
+// Computes the raw (pre-encoding) size in bytes of the values stored in a
+// serialized encoding, by decoding it and walking its nested encodings.
+//
+// Sizing rules, as implemented below:
+//  - Nullable: raw size of the nested non-null values plus one byte per null
+//    row.
+//  - Non-string data: dataTypeSize(dataType) * rowCount.
+//  - String data: the sum of all string lengths, derived per encoding type
+//    (Trivial, Constant, MainlyConstant, Dictionary, RLE).
+//
+// @param memoryPool Pool handed to EncodingFactory::decode.
+// @param encodingStr Serialized bytes of exactly one encoding.
+// @return Raw data size in bytes.
+//
+// Reports NIMBLE_NOT_SUPPORTED for Sentinel encodings and for string data
+// held in any encoding type other than the five listed above.
+uint64_t TestUtils::getRawDataSize(
+    velox::memory::MemoryPool& memoryPool,
+    std::string_view encodingStr) {
+  auto encoding = EncodingFactory::decode(memoryPool, encodingStr);
+  const EncodingType encodingType = encoding->encodingType();
+  const DataType dataType = encoding->dataType();
+  const uint32_t rowCount = encoding->rowCount();
+
+  if (encodingType == EncodingType::Sentinel) {
+    NIMBLE_NOT_SUPPORTED("Sentinel encoding is not supported");
+  }
+
+  if (encodingType == EncodingType::Nullable) {
+    // Payload starts with the byte length of the nested non-nulls encoding,
+    // followed by that encoding. The code relies on readUint32 advancing
+    // `pos` past the length it reads.
+    auto pos = encodingStr.data() + kPrefixSize;
+    const auto nonNullsSize = encoding::readUint32(pos);
+    const auto nonNullsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
+    // We do not count the bits indicating non-null, therefore we only
+    // include the size of the null rows (one byte each) and the non-null
+    // values.
+    return getRawDataSize(memoryPool, {pos, nonNullsSize}) +
+        (rowCount - nonNullsCount);
+  }
+
+  if (dataType != DataType::String) {
+    // Fixed-width values. Widen before multiplying so the product is always
+    // computed in 64 bits.
+    return static_cast<uint64_t>(nimble::detail::dataTypeSize(dataType)) *
+        rowCount;
+  }
+
+  // String data: the raw size is the sum of the string lengths, which has to
+  // be recovered differently for each encoding type.
+  auto pos = encodingStr.data() + kPrefixSize; // Skip the prefix.
+  uint64_t result = 0;
+
+  switch (encodingType) {
+    case EncodingType::Trivial: {
+      // The string lengths are stored as a nested encoding after the
+      // compression type.
+      pos += kCompressionTypeSize;
+      const auto lengthsSize = encoding::readUint32(pos);
+      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
+      std::vector<uint32_t> buffer(rowCount);
+      lengths->materialize(rowCount, buffer.data());
+      // Accumulate in 64 bits: an `int` seed would make std::accumulate sum
+      // the 32-bit lengths in an int and overflow on large inputs.
+      result += std::accumulate(buffer.begin(), buffer.end(), uint64_t{0});
+      break;
+    }
+
+    case EncodingType::Constant: {
+      // Every row holds the same value, so only its length is needed.
+      const auto valueSize = encoding::readUint32(pos);
+      // Widen first; rowCount * valueSize would otherwise wrap at 32 bits.
+      result += static_cast<uint64_t>(rowCount) * valueSize;
+      break;
+    }
+
+    case EncodingType::MainlyConstant: {
+      // Payload order as read here: isCommon encoding, other-values
+      // encoding, then the constant value (each preceded by its length).
+      const auto isCommonSize = encoding::readUint32(pos);
+      pos += isCommonSize;
+      const auto otherValuesSize = encoding::readUint32(pos);
+      const auto otherValuesOffset = pos;
+      const auto otherValuesCount =
+          encoding::peek<uint32_t>(pos + kRowCountOffset);
+      pos += otherValuesSize;
+      const auto constantValueSize = encoding::readUint32(pos);
+      // Rows not covered by the other-values encoding hold the constant.
+      result += static_cast<uint64_t>(rowCount - otherValuesCount) *
+          constantValueSize;
+      result +=
+          getRawDataSize(memoryPool, {otherValuesOffset, otherValuesSize});
+      break;
+    }
+
+    case EncodingType::Dictionary: {
+      // Materialize the alphabet (distinct strings) and the per-row indices
+      // into it, then sum the length of the string each row refers to.
+      const auto alphabetSize = encoding::readUint32(pos);
+      const auto alphabetCount =
+          encoding::peek<uint32_t>(pos + kRowCountOffset);
+      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
+      std::vector<std::string_view> alphabetBuffer(alphabetCount);
+      alphabet->materialize(alphabetCount, alphabetBuffer.data());
+
+      // The indices encoding takes up the rest of the buffer.
+      pos += alphabetSize;
+      const auto indicesSize =
+          encodingStr.length() - (pos - encodingStr.data());
+      auto indices = EncodingFactory::decode(memoryPool, {pos, indicesSize});
+      std::vector<uint32_t> indicesBuffer(rowCount);
+      indices->materialize(rowCount, indicesBuffer.data());
+      for (const auto index : indicesBuffer) {
+        result += alphabetBuffer[index].size();
+      }
+      break;
+    }
+
+    case EncodingType::RLE: {
+      // Materialize run lengths and run values; each run contributes
+      // (run length * length of the run's string).
+      const auto runLengthsSize = encoding::readUint32(pos);
+      const auto runLengthsCount =
+          encoding::peek<uint32_t>(pos + kRowCountOffset);
+      auto runLengths =
+          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
+      std::vector<uint32_t> runLengthsBuffer(runLengthsCount);
+      runLengths->materialize(runLengthsCount, runLengthsBuffer.data());
+
+      // The run values encoding takes up the rest of the buffer.
+      pos += runLengthsSize;
+      const auto runValuesSize =
+          encodingStr.length() - (pos - encodingStr.data());
+      auto runValues =
+          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
+      std::vector<std::string_view> runValuesBuffer(runLengthsCount);
+      runValues->materialize(runLengthsCount, runValuesBuffer.data());
+
+      for (uint32_t i = 0; i < runLengthsCount; ++i) {
+        result += static_cast<uint64_t>(runLengthsBuffer[i]) *
+            runValuesBuffer[i].size();
+      }
+      break;
+    }
+
+    default:
+      NIMBLE_NOT_SUPPORTED("Encoding type does not support strings.");
+  }
+  return result;
+}
+} // namespace facebook::nimble::test
diff --git a/dwio/nimble/encodings/tests/TestUtils.h b/dwio/nimble/encodings/tests/TestUtils.h
index 8f09677..2738b14 100644
--- a/dwio/nimble/encodings/tests/TestUtils.h
+++ b/dwio/nimble/encodings/tests/TestUtils.h
@@ -16,7 +16,6 @@
 #pragma once
 
 #include "dwio/nimble/encodings/ConstantEncoding.h"
-#include "dwio/nimble/encodings/DeltaEncoding.h"
 #include "dwio/nimble/encodings/DictionaryEncoding.h"
 #include "dwio/nimble/encodings/Encoding.h"
 #include "dwio/nimble/encodings/EncodingFactory.h"
@@ -236,4 +235,14 @@ class Encoder {
         encodeNullable(buffer, values, nulls, compressionType));
   }
 };
+
+class TestUtils {
+ public:
+  // Returns the raw (pre-encoding) size in bytes of the values held in the
+  // serialized encoding `encodingStr`. `memoryPool` is used to decode the
+  // encoding and any encodings nested inside it.
+  static uint64_t getRawDataSize(
+      velox::memory::MemoryPool& memoryPool,
+      std::string_view encodingStr);
+};
 } // namespace facebook::nimble::test
diff --git a/dwio/nimble/velox/tests/TestUtils.h b/dwio/nimble/velox/tests/TestUtils.h
new file mode 100644
index 0000000..4703238
--- /dev/null
+++ b/dwio/nimble/velox/tests/TestUtils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+#include "dwio/nimble/tablet/TabletReader.h"
+#include "dwio/nimble/velox/ChunkedStream.h"
+
+namespace facebook::nimble::test {
+
+// Computes the total raw data size, in bytes, of a tablet by reading it back.
+//
+// For every stripe, all of its streams are loaded; each non-null stream is
+// split into chunks and the raw size of each chunk (as reported by
+// TestUtils::getRawDataSize) is added to the total.
+//
+// @param pool Memory pool used for chunk iteration and encoding decoding.
+// @param tablet Tablet whose stripes and streams are read.
+// @return Sum of the raw data sizes of all chunks of all loaded streams.
+inline std::uint64_t getRawStreamSize(
+    velox::memory::MemoryPool& pool,
+    nimble::TabletReader& tablet) {
+  std::uint64_t rawSize = 0;
+  // Unsigned index: avoids a signed/unsigned comparison with stripeCount().
+  for (uint32_t i = 0; i < tablet.stripeCount(); ++i) {
+    auto stripeIdentifier = tablet.getStripeIdentifier(i);
+
+    // Request every stream of the stripe: identifiers 0 .. numStreams - 1.
+    const auto numStreams = tablet.streamCount(stripeIdentifier);
+    std::vector<uint32_t> identifiers(numStreams);
+    std::iota(identifiers.begin(), identifiers.end(), 0u);
+    auto streams = tablet.load(stripeIdentifier, identifiers);
+
+    for (auto& stream : streams) {
+      // Streams that load() returns as nullptr carry no data; skip them.
+      if (stream == nullptr) {
+        continue;
+      }
+      // Each chunk is passed to getRawDataSize as one serialized encoding.
+      nimble::InMemoryChunkedStream chunkedStream{pool, std::move(stream)};
+      while (chunkedStream.hasNext()) {
+        auto chunk = chunkedStream.nextChunk();
+        rawSize += TestUtils::getRawDataSize(pool, chunk);
+      }
+    }
+  }
+  return rawSize;
+}
+
+} // namespace facebook::nimble::test