Skip to content

Commit

Permalink
Add getRawDataSize to Test Utilities (#73)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #73

# Changes

Adds functions to `TestUtils.h` in the tests folders of both `encodings` and `velox` that allow us to calculate the raw data size of files by reading them.
- Note the special handling for Nullable encoding types and String data types in the `getRawDataSize` function.

# Context
This will be used by nimble_dump later on.
RawDataSize is for streams and encodings, so this captures the raw data size *after* some optimizations such as deduping are applied, but *before* compression and serde.

Reviewed By: sdruzkin, helfman

Differential Revision: D61050482

fbshipit-source-id: bbd2eb59f2891c95ef6eb5a10631f85b3bd6a7be
  • Loading branch information
phoenixawe authored and facebook-github-bot committed Sep 20, 2024
1 parent 6a53405 commit eda098a
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 6 deletions.
4 changes: 4 additions & 0 deletions dwio/nimble/encodings/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Helper library built from TestUtils.cpp; it is linked against
# nimble_encodings so the helpers can use the encoding implementation.
add_library(nimble_encodings_tests_utils TestUtils.cpp)
target_link_libraries(nimble_encodings_tests_utils nimble_encodings)

add_executable(
nimble_encodings_tests
ConstantEncodingTests.cpp
Expand All @@ -27,6 +30,7 @@ add_test(nimble_encodings_tests nimble_encodings_tests)

target_link_libraries(
nimble_encodings_tests
nimble_encodings_tests_utils
nimble_encodings
nimble_common
nimble_tools_common
Expand Down
76 changes: 71 additions & 5 deletions dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "dwio/nimble/encodings/EncodingFactory.h"
#include "dwio/nimble/encodings/EncodingSelectionPolicy.h"
#include "dwio/nimble/encodings/NullableEncoding.h"
#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/tools/EncodingUtilities.h"

using namespace ::facebook;
Expand Down Expand Up @@ -112,6 +113,12 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size_bytes();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

ASSERT_GT(expected.size(), 0);
Expand Down Expand Up @@ -554,6 +561,12 @@ TEST(EncodingSelectionBoolTests, SelectTrivial) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -599,6 +612,12 @@ TEST(EncodingSelectionBoolTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -635,6 +654,12 @@ TEST(EncodingSelectionStringTests, SelectConst) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = value.size() * values.size();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand All @@ -659,24 +684,35 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) {
std::string(5000, '\0'),
}) {
std::vector<T> values;
values.resize(1000);
auto expectedSize = 0;

auto resize = 1000;
values.resize(resize);
for (auto i = 0; i < values.size(); ++i) {
values[i] = value;
}
expectedSize += resize * value.size();

std::vector<std::string> uncommonValues;
for (auto i = 0; i < values.size() / 20; ++i) {
uncommonValues.emplace_back(i, 'b');
}

for (auto i = 0; i < uncommonValues.size(); ++i) {
values[i * 20] = uncommonValues[i];
std::string_view val = uncommonValues[i];
values[i * 20] = val;
expectedSize += val.size() - value.size();
}

auto policy = getRootManualSelectionPolicy<T>();
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -730,14 +766,21 @@ TEST(EncodingSelectionStringTests, SelectTrivial) {
}

std::vector<T> values;
auto expectedSize = 0;
values.resize(cache.size());
for (auto i = 0; i < cache.size(); ++i) {
values[i] = cache[i];
expectedSize += cache[i].size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -769,14 +812,22 @@ TEST(EncodingSelectionStringTests, SelectDictionary) {
auto policy = getRootManualSelectionPolicy<T>();

std::vector<T> values;
auto expectedSize = 0;
values.resize(10000);
for (auto i = 0; i < values.size(); ++i) {
values[i] = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
T val = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
values[i] = val;
expectedSize += val.size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -821,12 +872,15 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
}

std::vector<T> values;
auto expectedSize = 0;
values.reserve(valueCount);
auto index = 0;
for (const auto length : runLengths) {
for (auto i = 0; i < length; ++i) {
values.emplace_back(
index % 2 == 0 ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
std::string_view val =
((index % 2 == 0) ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
values.emplace_back(val);
expectedSize += val.size();
}
++index;
}
Expand All @@ -835,6 +889,11 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -878,5 +937,12 @@ TEST(EncodingSelectionTests, TestNullable) {

auto serialized = nimble::EncodingFactory::encodeNullable<T>(
std::move(policy), data, nulls, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = 15 + 6; // 15 bytes for string data, 6 bytes for nulls
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();
}
138 changes: 138 additions & 0 deletions dwio/nimble/encodings/tests/TestUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) Meta Platforms, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/encodings/EncodingUtils.h"

namespace facebook::nimble::test {

namespace {

// Byte offset, relative to the start of a serialized encoding, at which
// getRawDataSize peeks the 32-bit row count of that encoding.
constexpr int kRowCountOffset{2};

// Number of leading bytes getRawDataSize skips ("the prefix") before it
// starts reading the encoding-specific payload.
constexpr int kPrefixSize{6};

// Number of bytes skipped right after the prefix of a Trivial string
// encoding, before the size of the nested lengths encoding is read.
// NOTE(review): presumably the one-byte compression type tag -- confirm
// against the Trivial encoding layout.
constexpr int kCompressionTypeSize{1};

} // namespace

// Computes the raw data size, in bytes, of the values stored in a serialized
// encoding.
//
// The encoding is decoded once to obtain its encoding type, data type and row
// count; the size is then derived as follows:
//   * Nullable: raw size of the nested non-null values encoding plus one byte
//     for every null row (rowCount - nonNullsCount).
//   * Non-string data types: dataTypeSize(dataType) * rowCount.
//   * String data type: sum of the lengths of all logical string values,
//     computed per encoding type (Trivial, Constant, MainlyConstant,
//     Dictionary, RLE) by walking the serialized layout.
//
// All sizes are accumulated in 64-bit arithmetic so that large inputs do not
// wrap around at 2^32.
//
// @param memoryPool  Pool handed to EncodingFactory::decode for this encoding
//                    and for any nested encodings that must be materialized.
// @param encodingStr Serialized encoding, starting at its prefix.
// @return Raw data size in bytes.
// Raises via NIMBLE_NOT_SUPPORTED for Sentinel encodings and for string
// encodings whose type is not one of the cases handled below.
uint64_t TestUtils::getRawDataSize(
    velox::memory::MemoryPool& memoryPool,
    std::string_view encodingStr) {
  auto encoding = EncodingFactory::decode(memoryPool, encodingStr);
  const EncodingType encodingType = encoding->encodingType();
  const DataType dataType = encoding->dataType();
  const uint32_t rowCount = encoding->rowCount();

  if (encodingType == EncodingType::Sentinel) {
    NIMBLE_NOT_SUPPORTED("Sentinel encoding is not supported");
  }

  if (encodingType == EncodingType::Nullable) {
    auto pos = encodingStr.data() + kPrefixSize;
    const auto nonNullsSize = encoding::readUint32(pos);
    const auto nonNullsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
    // We do not count the bits indicating non-null, therefore we only
    // include the size of the null bits and the non-null values.
    return getRawDataSize(memoryPool, {pos, nonNullsSize}) +
        static_cast<uint64_t>(rowCount - nonNullsCount);
  }

  if (dataType != DataType::String) {
    // Fixed-width values: widen before multiplying to avoid 32-bit overflow.
    return static_cast<uint64_t>(nimble::detail::dataTypeSize(dataType)) *
        rowCount;
  }

  auto pos = encodingStr.data() + kPrefixSize; // Skip the prefix.
  uint64_t result = 0;

  switch (encodingType) {
    case EncodingType::Trivial: {
      pos += kCompressionTypeSize;
      const auto lengthsSize = encoding::readUint32(pos);
      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
      std::vector<uint32_t> buffer(rowCount);
      lengths->materialize(rowCount, buffer.data());
      // The init value fixes the accumulator type; it must be 64-bit so the
      // sum of the 32-bit lengths cannot wrap.
      result += std::accumulate(buffer.begin(), buffer.end(), uint64_t{0});
      break;
    }

    case EncodingType::Constant: {
      const auto valueSize = encoding::readUint32(pos);
      result += static_cast<uint64_t>(rowCount) * valueSize;
      break;
    }

    case EncodingType::MainlyConstant: {
      // Layout walked here: isCommon encoding, other-values encoding, then
      // the size of the constant (common) value.
      const auto isCommonSize = encoding::readUint32(pos);
      pos += isCommonSize;
      const auto otherValuesSize = encoding::readUint32(pos);
      const auto otherValuesOffset = pos;
      const auto otherValuesCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      pos += otherValuesSize;
      const auto constantValueSize = encoding::readUint32(pos);
      // Rows holding the common value, plus the raw size of the exceptions.
      result += static_cast<uint64_t>(rowCount - otherValuesCount) *
          constantValueSize;
      result +=
          getRawDataSize(memoryPool, {otherValuesOffset, otherValuesSize});
      break;
    }

    case EncodingType::Dictionary: {
      const auto alphabetSize = encoding::readUint32(pos);
      const auto alphabetCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
      std::vector<std::string_view> alphabetBuffer(alphabetCount);
      alphabet->materialize(alphabetCount, alphabetBuffer.data());

      pos += alphabetSize;
      // The indices encoding occupies the remainder of the buffer.
      const auto indicesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto indices = EncodingFactory::decode(memoryPool, {pos, indicesSize});
      std::vector<uint32_t> indicesBuffer(rowCount);
      indices->materialize(rowCount, indicesBuffer.data());
      // Each row contributes the length of the alphabet entry it refers to.
      for (const uint32_t index : indicesBuffer) {
        result += alphabetBuffer[index].size();
      }
      break;
    }

    case EncodingType::RLE: {
      const auto runLengthsSize = encoding::readUint32(pos);
      const auto runLengthsCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto runLengths =
          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
      std::vector<uint32_t> runLengthsBuffer(runLengthsCount);
      runLengths->materialize(runLengthsCount, runLengthsBuffer.data());

      pos += runLengthsSize;
      // The run values encoding occupies the remainder of the buffer.
      const auto runValuesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto runValues =
          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
      std::vector<std::string_view> runValuesBuffer(runLengthsCount);
      runValues->materialize(runLengthsCount, runValuesBuffer.data());

      // Each run contributes (run length) * (length of the run's value).
      for (uint32_t i = 0; i < runLengthsCount; ++i) {
        result += static_cast<uint64_t>(runLengthsBuffer[i]) *
            runValuesBuffer[i].size();
      }
      break;
    }

    default:
      NIMBLE_NOT_SUPPORTED("Encoding type does not support strings.");
  }
  return result;
}
} // namespace facebook::nimble::test
8 changes: 7 additions & 1 deletion dwio/nimble/encodings/tests/TestUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include "dwio/nimble/encodings/ConstantEncoding.h"
#include "dwio/nimble/encodings/DeltaEncoding.h"
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/Encoding.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
Expand Down Expand Up @@ -236,4 +235,11 @@ class Encoder {
encodeNullable(buffer, values, nulls, compressionType));
}
};

// Static helpers shared by the encoding tests.
class TestUtils {
public:
// Returns the raw data size, in bytes, of the serialized encoding held in
// `encodingStr`. `memoryPool` is used to decode the encoding (and any nested
// encodings that have to be materialized to compute the size).
// Sentinel encodings are rejected as not supported.
static uint64_t getRawDataSize(
velox::memory::MemoryPool& memoryPool,
std::string_view encodingStr);
};
} // namespace facebook::nimble::test
Loading

0 comments on commit eda098a

Please sign in to comment.