Skip to content

Commit

Permalink
Passing 'serialization.null.format' to SerDeOptions::nullString. (facebo…
Browse files Browse the repository at this point in the history
…okincubator#8722)

Summary:
Pull Request resolved: facebookincubator#8722

Some TEXT tables define a custom NULL string.
We need to pass it to SerDeOptions::nullString so that reads return
the correct results.

Reviewed By: amitkdutta

Differential Revision: D53643548

fbshipit-source-id: 920f7f3b719e033fe8c31bcdd8dfe00fac405015
  • Loading branch information
Sergey Pershin authored and facebook-github-bot committed Feb 11, 2024
1 parent 9cf0ef0 commit 7b68a82
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 9 deletions.
19 changes: 13 additions & 6 deletions velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ std::shared_ptr<common::ScanSpec> makeScanSpec(
}

std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
const std::unordered_map<std::string, std::string>& serdeParameters) {
const std::unordered_map<std::string, std::string>& serdeParameters,
const std::unordered_map<std::string, std::string>& tableParameters) {
auto fieldIt = serdeParameters.find(dwio::common::SerDeOptions::kFieldDelim);
if (fieldIt == serdeParameters.end()) {
fieldIt = serdeParameters.find("serialization.format");
Expand All @@ -393,9 +394,13 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
auto mapKeyIt =
serdeParameters.find(dwio::common::SerDeOptions::kMapKeyDelim);

auto nullStringIt = tableParameters.find(
dwio::common::TableParameter::kSerializationNullFormat);

if (fieldIt == serdeParameters.end() &&
collectionIt == serdeParameters.end() &&
mapKeyIt == serdeParameters.end()) {
mapKeyIt == serdeParameters.end() &&
nullStringIt == tableParameters.end()) {
return nullptr;
}

Expand All @@ -413,22 +418,23 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
}
auto serDeOptions = std::make_unique<dwio::common::SerDeOptions>(
fieldDelim, collectionDelim, mapKeyDelim);
serDeOptions->nullString = nullStringIt->second;
return serDeOptions;
}

void configureReaderOptions(
dwio::common::ReaderOptions& readerOptions,
const std::shared_ptr<HiveConfig>& hiveConfig,
const Config* sessionProperties,
const RowTypePtr& fileSchema,
std::shared_ptr<HiveConnectorSplit> hiveSplit) {
const std::shared_ptr<HiveTableHandle>& hiveTableHandle,
const std::shared_ptr<HiveConnectorSplit>& hiveSplit) {
readerOptions.setMaxCoalesceBytes(hiveConfig->maxCoalescedBytes());
readerOptions.setMaxCoalesceDistance(hiveConfig->maxCoalescedDistanceBytes());
readerOptions.setFileColumnNamesReadAsLowerCase(
hiveConfig->isFileColumnNamesReadAsLowerCase(sessionProperties));
readerOptions.setUseColumnNamesForColumnMapping(
hiveConfig->isOrcUseColumnNames(sessionProperties));
readerOptions.setFileSchema(fileSchema);
readerOptions.setFileSchema(hiveTableHandle->dataColumns());
readerOptions.setFooterEstimatedSize(hiveConfig->footerEstimatedSize());
readerOptions.setFilePreloadThreshold(hiveConfig->filePreloadThreshold());

Expand All @@ -439,7 +445,8 @@ void configureReaderOptions(
dwio::common::toString(readerOptions.getFileFormat()),
dwio::common::toString(hiveSplit->fileFormat));
} else {
auto serDeOptions = parseSerdeParameters(hiveSplit->serdeParameters);
auto serDeOptions = parseSerdeParameters(
hiveSplit->serdeParameters, hiveTableHandle->tableParameters());
if (serDeOptions) {
readerOptions.setSerDeOptions(*serDeOptions);
}
Expand Down
5 changes: 3 additions & 2 deletions velox/connectors/hive/HiveConnectorUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
namespace facebook::velox::connector::hive {

class HiveColumnHandle;
class HiveTableHandle;
class HiveConfig;
struct HiveConnectorSplit;

Expand Down Expand Up @@ -57,8 +58,8 @@ void configureReaderOptions(
dwio::common::ReaderOptions& readerOptions,
const std::shared_ptr<HiveConfig>& config,
const Config* sessionProperties,
const RowTypePtr& fileSchema,
std::shared_ptr<HiveConnectorSplit> hiveSplit);
const std::shared_ptr<HiveTableHandle>& hiveTableHandle,
const std::shared_ptr<HiveConnectorSplit>& hiveSplit);

void configureRowReaderOptions(
dwio::common::RowReaderOptions& rowReaderOptions,
Expand Down
2 changes: 1 addition & 1 deletion velox/connectors/hive/SplitReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ void SplitReader::configureReaderOptions() {
baseReaderOpts_,
hiveConfig_,
connectorQueryCtx_->sessionProperties(),
hiveTableHandle_->dataColumns(),
hiveTableHandle_,
hiveSplit_);
}

Expand Down
2 changes: 2 additions & 0 deletions velox/dwio/common/Options.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ class SerDeOptions {

// Names of table-level properties, used as keys into a table's
// string-to-string parameter map.
struct TableParameter {
  // Key "skip.header.line.count".
  // NOTE(review): presumably the number of leading header lines a text reader
  // should skip - confirm against the consumer of this key.
  static constexpr const char* kSkipHeaderLineCount{"skip.header.line.count"};

  // Key "serialization.null.format": the table's custom NULL string, which is
  // forwarded to SerDeOptions::nullString when present.
  static constexpr const char* kSerializationNullFormat{
      "serialization.null.format"};
};

/**
Expand Down

0 comments on commit 7b68a82

Please sign in to comment.