-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add getFileRawDataSize Test Utilities (#73)
Summary: Pull Request resolved: #73 # Changes Add functions to the `TestUtils.h` files in the tests folders of both `encodings` and `velox` that let us calculate the raw data size of a file by reading it back. - Note the special handling for Nullable encoding types and String data types in the `getRawChunkSize` function. # Context This is necessary to check the correctness of the raw data size stat added in the following diff, which calculates the size while writing. This change will also be used by nimble_dump later on (to be implemented in a later diff). Differential Revision: D61050482
- Loading branch information
1 parent
3fa84b2
commit 0fc9a1a
Showing
3 changed files
with
194 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "dwio/nimble/encodings/tests/TestUtils.h" | ||
#include "dwio/nimble/encodings/EncodingUtils.h" | ||
|
||
namespace facebook::nimble::test { | ||
|
||
// Byte offset, measured from the first byte of a serialized encoding, at which
// getRawChunkSize reads that encoding's 32-bit row count.
static constexpr int kRowCountOffset{2};
// Number of leading bytes getRawChunkSize skips to step over an encoding's
// prefix and reach its payload.
static constexpr int kPrefixSize{6};
// Number of bytes getRawChunkSize skips after the prefix of a Trivial string
// encoding (presumably the compression-type field, going by the name).
static constexpr int kCompressionTypeSize{1};
|
||
// Computes the raw (un-encoded) data size, in bytes, of one encoded chunk by
// decoding it and walking its serialized layout.
//
// Sizing rules, as implemented below:
//  - Nullable:    raw size of the nested non-null values encoding, plus one
//                 byte for every null row (rowCount - nonNullsCount).
//  - String data: sum of the string lengths of all rows; how those lengths are
//                 recovered depends on the encoding type.
//  - Other data:  dataTypeSize(dataType) * rowCount.
//
// @param memoryPool Pool used for decoding and for temporary buffers.
// @param chunk      Bytes of exactly one serialized encoding.
// @return           Raw data size of the chunk in bytes.
// @throws std::runtime_error for Sentinel encodings, and for string data
//         stored in an encoding type that is not handled below.
std::uint64_t TestUtils::getRawChunkSize(
    velox::memory::MemoryPool& memoryPool,
    std::string_view chunk) {
  // Reads a uint32 at `at` without moving the caller's cursor. The offsets
  // read this way are not guaranteed to be 4-byte aligned, so go through
  // encoding::readUint32 (as the Nullable branch always did) instead of
  // dereferencing a reinterpret_cast'ed pointer.
  const auto peekUint32 = [](const char* at) -> uint32_t {
    return encoding::readUint32(at);
  };

  auto decoded = EncodingFactory::decode(memoryPool, chunk);
  const EncodingType encodingType = decoded->encodingType();
  const DataType dataType = decoded->dataType();
  const uint32_t rowCount = decoded->rowCount();

  if (encodingType == EncodingType::Sentinel) {
    throw std::runtime_error("Sentinel encoding is not supported");
  }

  if (encodingType == EncodingType::Nullable) {
    const char* pos = chunk.data() + kPrefixSize;
    // readUint32 advances pos, leaving it on the nested non-nulls encoding.
    const uint32_t nonNullsSize = encoding::readUint32(pos);
    const uint32_t nonNullsCount = peekUint32(pos + kRowCountOffset);
    // Every null row contributes one byte on top of the non-null payload.
    return getRawChunkSize(memoryPool, {pos, nonNullsSize}) +
        (rowCount - nonNullsCount);
  }

  if (dataType != DataType::String) {
    // Fixed-width values. Widen before multiplying so the product cannot wrap
    // in a narrower type.
    return static_cast<std::uint64_t>(nimble::detail::dataTypeSize(dataType)) *
        rowCount;
  }

  // String data: total the byte length of every row's string.
  const char* pos = chunk.data() + kPrefixSize; // Skip the prefix.
  // 64-bit accumulator: the previous `auto result{0}` deduced `int`, which
  // overflows for large chunks and made the products below 32-bit.
  std::uint64_t result{0};

  switch (encodingType) {
    case EncodingType::Trivial: {
      pos += kCompressionTypeSize;
      const uint32_t lengthsSize = encoding::readUint32(pos);
      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
      Vector<uint32_t> lengthsBuffer{&memoryPool, rowCount};
      lengths->materialize(rowCount, lengthsBuffer.data());
      for (uint32_t row = 0; row < rowCount; ++row) {
        result += lengthsBuffer[row];
      }
      break;
    }

    case EncodingType::Constant: {
      // The single value is stored length-prefixed; only its length matters.
      const std::uint64_t valueLen = encoding::readUint32(pos);
      result += rowCount * valueLen;
      break;
    }

    case EncodingType::MainlyConstant: {
      // Step over the first nested encoding (size-prefixed).
      const uint32_t commonSize = encoding::readUint32(pos);
      pos += commonSize;

      const uint32_t otherValuesSize = encoding::readUint32(pos);
      const char* const otherValues = pos;
      const uint32_t otherValuesCount =
          peekUint32(otherValues + kRowCountOffset);
      pos += otherValuesSize;

      // Rows not covered by the other-values encoding all hold the common
      // value, whose length prefix follows the other-values encoding.
      const std::uint64_t valueLen = encoding::readUint32(pos);
      result += (rowCount - otherValuesCount) * valueLen;
      result += getRawChunkSize(memoryPool, {otherValues, otherValuesSize});
      break;
    }

    case EncodingType::Dictionary: {
      const uint32_t alphabetSize = encoding::readUint32(pos);
      const uint32_t alphabetCount = peekUint32(pos + kRowCountOffset);
      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
      Vector<std::string_view> alphabetValues{&memoryPool, alphabetCount};
      alphabet->materialize(alphabetCount, alphabetValues.data());

      pos += alphabetSize;
      // The indices encoding has no size prefix here, so it is taken to span
      // the rest of the chunk (assumes `chunk` ends with this encoding). The
      // length must be explicit: building a string_view from a bare
      // `const char*` would take its length from strlen() on binary data.
      const auto indicesSize =
          static_cast<std::size_t>(chunk.data() + chunk.size() - pos);
      auto indices = EncodingFactory::decode(
          memoryPool, std::string_view{pos, indicesSize});
      Vector<uint32_t> indicesBuffer{&memoryPool, rowCount};
      indices->materialize(rowCount, indicesBuffer.data());
      // Look lengths up directly in the materialized alphabet; this replaces a
      // stack-allocated variable-length array (non-standard C++).
      for (uint32_t row = 0; row < rowCount; ++row) {
        result += alphabetValues[indicesBuffer[row]].size();
      }
      break;
    }

    case EncodingType::RLE: {
      const uint32_t runLengthsSize = encoding::readUint32(pos);
      const uint32_t runCount = peekUint32(pos + kRowCountOffset);
      auto runLengths =
          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
      Vector<uint32_t> runLengthsBuffer{&memoryPool, runCount};
      runLengths->materialize(runCount, runLengthsBuffer.data());

      pos += runLengthsSize;
      // NOTE(review): this reads a 4-byte size in front of the run values
      // encoding; confirm against the RLE serialized layout that such a size
      // field exists (the Dictionary indices above have none).
      const uint32_t runValuesSize = encoding::readUint32(pos);
      auto runValues =
          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
      // String values are materialized as std::string_view (as in the
      // Dictionary case), not std::string.
      Vector<std::string_view> runValuesBuffer{&memoryPool, runCount};
      runValues->materialize(runCount, runValuesBuffer.data());
      for (uint32_t run = 0; run < runCount; ++run) {
        result += static_cast<std::uint64_t>(runLengthsBuffer[run]) *
            runValuesBuffer[run].size();
      }
      break;
    }

    default:
      throw std::runtime_error("Encoding type does not support strings.");
  }

  return result;
}
} // namespace facebook::nimble::test |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include "dwio/nimble/encodings/tests/TestUtils.h" | ||
#include "dwio/nimble/tablet/TabletReader.h" | ||
#include "dwio/nimble/velox/ChunkedStream.h" | ||
|
||
namespace facebook::nimble::test { | ||
// Calculates the raw data size of a file by reading it back: every chunk of
// every loaded stream in every stripe is passed to TestUtils::getRawChunkSize
// and the per-chunk sizes are summed.
//
// @param tablet Reader over the file whose raw data size is wanted.
// @param pool   Memory pool used for chunk iteration and decoding.
// @return       Sum of the raw sizes of all chunks in the file, in bytes.
//
// NOTE(review): std::iota is declared in <numeric>, which this header does not
// include directly; it currently relies on a transitive include.
inline std::uint64_t getFileRawDataSize(
    nimble::TabletReader& tablet,
    velox::memory::MemoryPool& pool) {
  std::uint64_t totalRawSize{0};

  // Unsigned index: avoids comparing a signed int against the stripe count.
  for (uint32_t stripeIndex = 0; stripeIndex < tablet.stripeCount();
       ++stripeIndex) {
    auto stripeIdentifier = tablet.getStripeIdentifier(stripeIndex);

    // Request stream ids 0..numStreams inclusive.
    // NOTE(review): that is one more id than streamCount() reports; confirm
    // whether the extra id is intentional.
    auto numStreams = tablet.streamCount(stripeIdentifier);
    std::vector<uint32_t> identifiers(numStreams + 1);
    std::iota(identifiers.begin(), identifiers.end(), 0);
    auto streams = tablet.load(stripeIdentifier, identifiers);

    for (auto& stream : streams) {
      // Null streams are reported as nullptr and contribute nothing.
      if (stream == nullptr) {
        continue;
      }
      nimble::InMemoryChunkedStream chunkedStream{pool, std::move(stream)};
      while (chunkedStream.hasNext()) {
        auto chunk = chunkedStream.nextChunk();
        totalRawSize += TestUtils::getRawChunkSize(pool, chunk);
      }
    }
  }

  return totalRawSize;
}
|
||
} // namespace facebook::nimble::test |