Skip to content

Commit

Permalink
Add getFileRawDataSize Test Utilities (#73)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #73

# Changes

Adds functions to the `TestUtils.h` headers in the tests folders of both `encodings` and `velox` that allow us to calculate the raw data size of a file by reading it back.
- Note the special handling for Nullable encoding types and String data types in the `getRawChunkSize` function.

# Context
This is necessary to check the correctness of the raw data size stat added in the following diff, which calculates the size when writing.
This change will also be used by nimble_dump later on (to be implemented in a later diff).

Reviewed By: sdruzkin, helfman

Differential Revision: D61050482
  • Loading branch information
phoenixawe authored and facebook-github-bot committed Sep 19, 2024
1 parent 6a53405 commit bbf1351
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 6 deletions.
4 changes: 4 additions & 0 deletions dwio/nimble/encodings/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Helper library built from TestUtils.cpp. It depends on nimble_encodings and
# is linked into the nimble_encodings_tests executable.
add_library(nimble_encodings_tests_utils TestUtils.cpp)
target_link_libraries(nimble_encodings_tests_utils nimble_encodings)

add_executable(
nimble_encodings_tests
ConstantEncodingTests.cpp
Expand All @@ -27,6 +30,7 @@ add_test(nimble_encodings_tests nimble_encodings_tests)

target_link_libraries(
nimble_encodings_tests
nimble_encodings_tests_utils
nimble_encodings
nimble_common
nimble_tools_common
Expand Down
76 changes: 71 additions & 5 deletions dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "dwio/nimble/encodings/EncodingFactory.h"
#include "dwio/nimble/encodings/EncodingSelectionPolicy.h"
#include "dwio/nimble/encodings/NullableEncoding.h"
#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/tools/EncodingUtilities.h"

using namespace ::facebook;
Expand Down Expand Up @@ -112,6 +113,12 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size_bytes();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

ASSERT_GT(expected.size(), 0);
Expand Down Expand Up @@ -554,6 +561,12 @@ TEST(EncodingSelectionBoolTests, SelectTrivial) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -599,6 +612,12 @@ TEST(EncodingSelectionBoolTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -635,6 +654,12 @@ TEST(EncodingSelectionStringTests, SelectConst) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = value.size() * values.size();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand All @@ -659,24 +684,35 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) {
std::string(5000, '\0'),
}) {
std::vector<T> values;
values.resize(1000);
auto expectedSize = 0;

auto resize = 1000;
values.resize(resize);
for (auto i = 0; i < values.size(); ++i) {
values[i] = value;
}
expectedSize += resize * value.size();

std::vector<std::string> uncommonValues;
for (auto i = 0; i < values.size() / 20; ++i) {
uncommonValues.emplace_back(i, 'b');
}

for (auto i = 0; i < uncommonValues.size(); ++i) {
values[i * 20] = uncommonValues[i];
std::string_view val = uncommonValues[i];
values[i * 20] = val;
expectedSize += val.size() - value.size();
}

auto policy = getRootManualSelectionPolicy<T>();
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -730,14 +766,21 @@ TEST(EncodingSelectionStringTests, SelectTrivial) {
}

std::vector<T> values;
auto expectedSize = 0;
values.resize(cache.size());
for (auto i = 0; i < cache.size(); ++i) {
values[i] = cache[i];
expectedSize += cache[i].size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -769,14 +812,22 @@ TEST(EncodingSelectionStringTests, SelectDictionary) {
auto policy = getRootManualSelectionPolicy<T>();

std::vector<T> values;
auto expectedSize = 0;
values.resize(10000);
for (auto i = 0; i < values.size(); ++i) {
values[i] = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
T val = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
values[i] = val;
expectedSize += val.size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -821,12 +872,15 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
}

std::vector<T> values;
auto expectedSize = 0;
values.reserve(valueCount);
auto index = 0;
for (const auto length : runLengths) {
for (auto i = 0; i < length; ++i) {
values.emplace_back(
index % 2 == 0 ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
std::string_view val =
((index % 2 == 0) ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
values.emplace_back(val);
expectedSize += val.size();
}
++index;
}
Expand All @@ -835,6 +889,11 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -878,5 +937,12 @@ TEST(EncodingSelectionTests, TestNullable) {

auto serialized = nimble::EncodingFactory::encodeNullable<T>(
std::move(policy), data, nulls, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = 15 + 6; // 15 bytes for string data, 6 bytes for nulls
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();
}
138 changes: 138 additions & 0 deletions dwio/nimble/encodings/tests/TestUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) Meta Platforms, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/encodings/EncodingUtils.h"

namespace facebook::nimble::test {

// Byte offset, from the start of a serialized encoding, at which
// getRawDataSize peeks a uint32 row count of a nested encoding without
// decoding it.
static constexpr int kRowCountOffset = 2;
// Number of leading bytes getRawDataSize skips to get past an encoding's
// prefix and reach its encoding-specific payload.
// NOTE(review): presumably 1 byte encoding type + 1 byte data type + 4 bytes
// row count, which would match kRowCountOffset -- confirm against the writer.
static constexpr int kPrefixSize = 6;
// Number of bytes skipped for the compression type field that precedes the
// lengths stream when reading a string Trivial encoding.
static constexpr int kCompressionTypeSize = 1;

// Computes the raw (unencoded) size, in bytes, of the data held by a
// serialized encoding by reading the encoding back.
//
// - Nullable encodings: raw size of the nested non-null values encoding plus
//   one byte for every null row.
// - Non-string data types: dataTypeSize(dataType) * rowCount.
// - String data types: the sum of the lengths of all string values, computed
//   per encoding type (Trivial, Constant, MainlyConstant, Dictionary, RLE).
//
// @param memoryPool Pool used to decode the encoding and its nested streams.
// @param encodingStr Serialized encoding, starting at its prefix.
// @return Raw data size in bytes.
//
// Raises a "not supported" error (via NIMBLE_NOT_SUPPORTED) for Sentinel
// encodings and for string data stored in any other encoding type.
//
// All intermediate products are widened to 64 bits before multiplying so that
// large row counts or long strings cannot wrap around in 32-bit arithmetic.
uint64_t TestUtils::getRawDataSize(
    velox::memory::MemoryPool& memoryPool,
    std::string_view encodingStr) {
  auto encoding = EncodingFactory::decode(memoryPool, encodingStr);
  const EncodingType encodingType = encoding->encodingType();
  const DataType dataType = encoding->dataType();
  const uint32_t rowCount = encoding->rowCount();

  if (encodingType == EncodingType::Sentinel) {
    NIMBLE_NOT_SUPPORTED("Sentinel encoding is not supported");
  }

  if (encodingType == EncodingType::Nullable) {
    const char* pos = encodingStr.data() + kPrefixSize;
    // readUint32 advances pos past the size field, so pos now points at the
    // nested non-null values encoding.
    const auto nonNullsSize = encoding::readUint32(pos);
    const auto nonNullsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
    // We do not count the bits indicating non-null, therefore we only
    // include the size of the null bits and the non-null values.
    return getRawDataSize(memoryPool, {pos, nonNullsSize}) +
        (rowCount - nonNullsCount);
  }

  if (dataType != DataType::String) {
    // Fixed-width values: every row contributes exactly the type's size.
    return static_cast<uint64_t>(nimble::detail::dataTypeSize(dataType)) *
        rowCount;
  }

  // String data: the raw size is the total length of all string values, which
  // has to be recovered from the encoding-specific payload.
  const char* pos = encodingStr.data() + kPrefixSize; // Skip the prefix.
  uint64_t result = 0;

  switch (encodingType) {
    case EncodingType::Trivial: {
      pos += kCompressionTypeSize;
      const auto lengthsSize = encoding::readUint32(pos);
      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
      std::vector<uint32_t> lengthsBuffer(rowCount);
      lengths->materialize(rowCount, lengthsBuffer.data());
      // The 64-bit initial value makes std::accumulate sum in uint64_t; a
      // plain `0` would sum in int and could overflow.
      result += std::accumulate(
          lengthsBuffer.begin(), lengthsBuffer.end(), uint64_t{0});
      break;
    }

    case EncodingType::Constant: {
      const auto valueSize = encoding::readUint32(pos);
      result += static_cast<uint64_t>(rowCount) * valueSize;
      break;
    }

    case EncodingType::MainlyConstant: {
      // Skip the "is common" stream to reach the uncommon ("other") values.
      const auto isCommonSize = encoding::readUint32(pos);
      pos += isCommonSize;
      const auto otherValuesSize = encoding::readUint32(pos);
      const char* otherValuesOffset = pos;
      const auto otherValuesCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      pos += otherValuesSize;
      const auto constantValueSize = encoding::readUint32(pos);
      // Rows holding the common value, plus the recursively computed size of
      // the uncommon values.
      result += static_cast<uint64_t>(rowCount - otherValuesCount) *
          constantValueSize;
      result +=
          getRawDataSize(memoryPool, {otherValuesOffset, otherValuesSize});
      break;
    }

    case EncodingType::Dictionary: {
      const auto alphabetSize = encoding::readUint32(pos);
      const auto alphabetCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
      std::vector<std::string_view> alphabetBuffer(alphabetCount);
      alphabet->materialize(alphabetCount, alphabetBuffer.data());

      // The indices stream occupies the remainder of the encoding.
      pos += alphabetSize;
      const auto indicesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto indices = EncodingFactory::decode(memoryPool, {pos, indicesSize});
      std::vector<uint32_t> indicesBuffer(rowCount);
      indices->materialize(rowCount, indicesBuffer.data());
      for (const uint32_t index : indicesBuffer) {
        result += alphabetBuffer[index].size();
      }
      break;
    }

    case EncodingType::RLE: {
      const auto runLengthsSize = encoding::readUint32(pos);
      const auto runLengthsCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto runLengths =
          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
      std::vector<uint32_t> runLengthsBuffer(runLengthsCount);
      runLengths->materialize(runLengthsCount, runLengthsBuffer.data());

      // The run values stream occupies the remainder of the encoding and has
      // one value per run.
      pos += runLengthsSize;
      const auto runValuesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto runValues =
          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
      std::vector<std::string_view> runValuesBuffer(runLengthsCount);
      runValues->materialize(runLengthsCount, runValuesBuffer.data());

      for (uint32_t i = 0; i < runLengthsCount; ++i) {
        result += static_cast<uint64_t>(runLengthsBuffer[i]) *
            runValuesBuffer[i].size();
      }
      break;
    }

    default:
      NIMBLE_NOT_SUPPORTED("Encoding type does not support strings.");
  }
  return result;
}
} // namespace facebook::nimble::test
8 changes: 7 additions & 1 deletion dwio/nimble/encodings/tests/TestUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include "dwio/nimble/encodings/ConstantEncoding.h"
#include "dwio/nimble/encodings/DeltaEncoding.h"
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/Encoding.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
Expand Down Expand Up @@ -236,4 +235,11 @@ class Encoder {
encodeNullable(buffer, values, nulls, compressionType));
}
};

// Test-only helpers for inspecting serialized encodings.
class TestUtils {
public:
// Returns the raw (unencoded) data size, in bytes, of the serialized encoding
// in `encodingStr`. The encoding is decoded using `memoryPool`; string data
// is sized by the total length of its values, and each null row of a
// Nullable encoding adds one byte.
static uint64_t getRawDataSize(
velox::memory::MemoryPool& memoryPool,
std::string_view encodingStr);
};
} // namespace facebook::nimble::test
Loading

0 comments on commit bbf1351

Please sign in to comment.