Skip to content

Commit

Permalink
Add getFileRawDataSize Test Utilities (facebookincubator#73)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#73

# Changes

Adding functions to `TestUtils.h` in the tests folders of both `encodings` and `velox`, which allow us to calculate the raw data size of files by reading them back.
- Note the special handling for Nullable encoding types and String data types in the `getRawChunkSize` function.

# Context
This is necessary to check the correctness of the raw data size stat added in the following diff, which calculates the size when writing.
This change will also be used by nimble_dump later on (to be implemented in a later diff).

Differential Revision: D61050482
  • Loading branch information
phoenixawe authored and facebook-github-bot committed Aug 20, 2024
1 parent 3fa84b2 commit 0fc9a1a
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 1 deletion.
134 changes: 134 additions & 0 deletions dwio/nimble/encodings/tests/TestUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Copyright (c) Meta Platforms, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/encodings/EncodingUtils.h"

namespace facebook::nimble::test {

// Byte offsets used below when walking the serialized form of an encoding.

// Offset, from the first byte of a serialized encoding, at which a uint32
// count is read (used below to obtain the row count of a nested encoding).
// NOTE(review): presumably mirrors the prefix layout defined by the encoding
// library — confirm against Encoding.h.
static constexpr int kRowCountOffset = 2;
// Number of bytes skipped at the start of a chunk to get past the encoding
// prefix and reach the encoding-specific payload.
static constexpr int kPrefixSize = 6;
// Number of bytes skipped in a Trivial string encoding before the size of the
// lengths sub-encoding is read; presumably the compression type marker.
static constexpr int kCompressionTypeSize = 1;

/// Computes the raw (logical) data size of a single encoded chunk by decoding
/// it.
///
/// - Nullable encodings: raw size of the nested non-null values encoding plus
///   1 for every null row (rowCount - nonNullsCount).
/// - String data: the sum of the lengths of all string values, computed per
///   encoding type (Trivial, Constant, MainlyConstant, Dictionary, RLE).
/// - Any other data type: dataTypeSize(dataType) * rowCount.
///
/// @param memoryPool Pool used for decoding and for temporary buffers.
/// @param chunk Serialized encoding, starting at its prefix. Its size is used
///   to bound the trailing indices section of Dictionary encodings.
/// @return Raw size of the chunk's data.
/// @throws std::runtime_error for Sentinel encodings, and for string data
///   stored in an encoding type not listed above.
std::uint64_t TestUtils::getRawChunkSize(
    velox::memory::MemoryPool& memoryPool,
    std::string_view chunk) {
  auto decoded = EncodingFactory::decode(memoryPool, chunk);
  const EncodingType encodingType = decoded->encodingType();
  const DataType dataType = decoded->dataType();
  const uint32_t rowCount = decoded->rowCount();

  // Reads the row count stored in the prefix of a nested serialized encoding
  // without advancing the caller's cursor. Uses readUint32 (as the Nullable
  // branch always did) rather than dereferencing a possibly unaligned pointer.
  const auto nestedRowCount = [](const char* nestedEncoding) -> uint32_t {
    const char* countPos = nestedEncoding + kRowCountOffset;
    return encoding::readUint32(countPos);
  };

  if (encodingType == EncodingType::Sentinel) {
    throw std::runtime_error("Sentinel encoding is not supported");
  }

  if (encodingType == EncodingType::Nullable) {
    const char* pos = chunk.data() + kPrefixSize;
    // readUint32 advances pos past the size field, so pos now points at the
    // nested non-nulls encoding.
    const auto nonNullsSize = encoding::readUint32(pos);
    const auto nonNullsCount = nestedRowCount(pos);
    return getRawChunkSize(memoryPool, {pos, nonNullsSize}) +
        (rowCount - nonNullsCount);
  }

  if (dataType != DataType::String) {
    // Fixed-width values: widen before multiplying so the product cannot wrap
    // in a narrower type.
    const auto typeSize = nimble::detail::dataTypeSize(dataType);
    return static_cast<std::uint64_t>(typeSize) * rowCount;
  }

  const char* pos = chunk.data() + kPrefixSize; // Skip the prefix.
  // Accumulate in 64 bits; the previous `auto result{0}` deduced `int`.
  std::uint64_t result{0};

  switch (encodingType) {
    case EncodingType::Trivial: {
      pos += kCompressionTypeSize;
      const auto lengthsSize = encoding::readUint32(pos);
      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
      Vector<uint32_t> lengthsBuffer{&memoryPool, rowCount};
      lengths->materialize(rowCount, lengthsBuffer.data());
      for (uint32_t i = 0; i < rowCount; ++i) {
        result += lengthsBuffer[i];
      }
      break;
    }

    case EncodingType::Constant: {
      const auto valueLen = encoding::readUint32(pos);
      result += static_cast<std::uint64_t>(rowCount) * valueLen;
      break;
    }

    case EncodingType::MainlyConstant: {
      // Skip the first size-prefixed section to reach the other-values
      // encoding.
      const auto commonSize = encoding::readUint32(pos);
      pos += commonSize;
      const auto otherValuesSize = encoding::readUint32(pos);
      const char* otherValuesOffset = pos;
      const auto otherValuesCount = nestedRowCount(otherValuesOffset);
      pos += otherValuesSize;
      // Length of the common value, applied to every row not covered by the
      // other-values encoding.
      const auto valueLen = encoding::readUint32(pos);
      result +=
          static_cast<std::uint64_t>(rowCount - otherValuesCount) * valueLen;
      result +=
          getRawChunkSize(memoryPool, {otherValuesOffset, otherValuesSize});
      break;
    }

    case EncodingType::Dictionary: {
      const auto alphabetSize = encoding::readUint32(pos);
      const auto alphabetCount = nestedRowCount(pos);
      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
      Vector<std::string_view> alphabetBuffer{&memoryPool, alphabetCount};
      alphabet->materialize(alphabetCount, alphabetBuffer.data());

      pos += alphabetSize;
      // The indices encoding occupies the rest of the chunk. Build the view
      // with an explicit length: constructing it from the pointer alone would
      // run strlen() over binary data.
      const std::string_view indicesData{
          pos, static_cast<size_t>(chunk.data() + chunk.size() - pos)};
      auto indices = EncodingFactory::decode(memoryPool, indicesData);
      Vector<uint32_t> indicesBuffer{&memoryPool, rowCount};
      indices->materialize(rowCount, indicesBuffer.data());
      // Look entry lengths up directly in the materialized alphabet; no
      // variable-length stack array is needed.
      for (uint32_t i = 0; i < rowCount; ++i) {
        result += alphabetBuffer[indicesBuffer[i]].size();
      }
      break;
    }

    case EncodingType::RLE: {
      const auto runLengthsSize = encoding::readUint32(pos);
      const auto runCount = nestedRowCount(pos);
      auto runLengths =
          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
      Vector<uint32_t> runLengthsBuffer{&memoryPool, runCount};
      runLengths->materialize(runCount, runLengthsBuffer.data());

      pos += runLengthsSize;
      const auto runValuesSize = encoding::readUint32(pos);
      auto runValues =
          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
      // String values are materialized as std::string_view, matching the
      // Dictionary case above.
      Vector<std::string_view> runValuesBuffer{&memoryPool, runCount};
      runValues->materialize(runCount, runValuesBuffer.data());
      for (uint32_t i = 0; i < runCount; ++i) {
        result += static_cast<std::uint64_t>(runLengthsBuffer[i]) *
            runValuesBuffer[i].size();
      }
      break;
    }

    default:
      throw std::runtime_error("Encoding type does not support strings.");
  }

  return result;
}
} // namespace facebook::nimble::test
8 changes: 7 additions & 1 deletion dwio/nimble/encodings/tests/TestUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include "dwio/nimble/encodings/ConstantEncoding.h"
#include "dwio/nimble/encodings/DeltaEncoding.h"
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/Encoding.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
Expand Down Expand Up @@ -236,4 +235,11 @@ class Encoder {
encodeNullable(buffer, values, nulls, compressionType));
}
};

// Test-only helpers for inspecting encoded data.
class TestUtils {
public:
// Returns the raw data size of the single encoded `chunk`, obtained by
// decoding it with `memoryPool`:
// - Nullable encodings: raw size of the nested non-null values plus 1 per
//   null row.
// - String data: the sum of the lengths of all string values.
// - Other data types: the data type's size multiplied by the row count.
// Throws std::runtime_error for Sentinel encodings and for string data stored
// in an encoding type that is not handled.
static std::uint64_t getRawChunkSize(
velox::memory::MemoryPool& memoryPool,
std::string_view chunk);
};
} // namespace facebook::nimble::test
53 changes: 53 additions & 0 deletions dwio/nimble/velox/tests/TestUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) Meta Platforms, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/tablet/TabletReader.h"
#include "dwio/nimble/velox/ChunkedStream.h"

namespace facebook::nimble::test {
// Calculate the rawDataSize of a file.
// Calculates the raw data size of a file by loading every stream of every
// stripe from `tablet`, splitting each stream into chunks, and summing
// TestUtils::getRawChunkSize over all chunks.
//
// `pool` is used for the chunked stream readers and for decoding each chunk.
// Returns the accumulated raw size; 0 when the tablet has no stripes.
inline std::uint64_t getFileRawDataSize(
    nimble::TabletReader& tablet,
    velox::memory::MemoryPool& pool) {
  uint64_t total{0};
  // Unsigned index: avoids the signed/unsigned comparison against
  // stripeCount().
  for (uint32_t stripe = 0; stripe < tablet.stripeCount(); ++stripe) {
    auto stripeIdentifier = tablet.getStripeIdentifier(stripe);

    // Request stream identifiers 0..numStreams inclusive.
    // NOTE(review): this asks for one identifier more than streamCount()
    // reports; presumably load() yields nullptr for absent streams — confirm
    // this is intentional.
    const auto numStreams = tablet.streamCount(stripeIdentifier);
    std::vector<uint32_t> identifiers(numStreams + 1);
    std::iota(identifiers.begin(), identifiers.end(), 0);
    auto streams = tablet.load(stripeIdentifier, identifiers);

    for (auto& stream : streams) {
      // Skip null streams, indicated by nullptr.
      if (stream == nullptr) {
        continue;
      }
      nimble::InMemoryChunkedStream chunkedStream{pool, std::move(stream)};
      while (chunkedStream.hasNext()) {
        auto chunk = chunkedStream.nextChunk();
        total += TestUtils::getRawChunkSize(pool, chunk);
      }
    }
  }
  return total;
}

} // namespace facebook::nimble::test

0 comments on commit 0fc9a1a

Please sign in to comment.