Skip to content

Commit

Permalink
chore: Update vendored sources to duckdb/duckdb@217ec47 (#511)
Browse files Browse the repository at this point in the history
[Adaptive Sniffer] In case files have only one row, be more permissive to detect headers and types. (duckdb/duckdb#14174)

Co-authored-by: krlmlr <krlmlr@users.noreply.github.com>
  • Loading branch information
github-actions[bot] and krlmlr authored Oct 19, 2024
1 parent 640f15f commit 5000d09
Show file tree
Hide file tree
Showing 22 changed files with 134 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,15 @@ void CSVBufferManager::ResetBuffer(const idx_t buffer_idx) {
}
}

idx_t CSVBufferManager::GetBufferSize() {
idx_t CSVBufferManager::GetBufferSize() const {
return buffer_size;
}

idx_t CSVBufferManager::BufferCount() {
idx_t CSVBufferManager::BufferCount() const {
return cached_buffers.size();
}

bool CSVBufferManager::Done() {
bool CSVBufferManager::Done() const {
return done;
}

Expand All @@ -144,7 +144,7 @@ void CSVBufferManager::ResetBufferManager() {
}
}

string CSVBufferManager::GetFilePath() {
string CSVBufferManager::GetFilePath() const {
return file_path;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/base_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"

namespace duckdb {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,53 @@ bool CSVSchema::Empty() const {
return columns.empty();
}

bool CSVSchema::SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path) {
D_ASSERT(names.size() == types.size());
bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const {
D_ASSERT(sniffer_result.names.size() == sniffer_result.return_types.size());
bool match = true;
unordered_map<string, TypeIdxPair> current_schema;
for (idx_t i = 0; i < names.size(); i++) {

for (idx_t i = 0; i < sniffer_result.names.size(); i++) {
// Populate our little schema
current_schema[names[i]] = {types[i], i};
current_schema[sniffer_result.names[i]] = {sniffer_result.return_types[i], i};
}
if (is_minimal_sniffer) {
auto min_sniffer = static_cast<AdaptiveSnifferResult &>(sniffer_result);
if (!min_sniffer.more_than_one_row) {
bool min_sniff_match = true;
// If we don't have more than one row, either the names must match or the types must match.
for (auto &column : columns) {
if (current_schema.find(column.name) == current_schema.end()) {
min_sniff_match = false;
break;
}
}
if (min_sniff_match) {
return true;
}
// Otherwise, the types must match.
min_sniff_match = true;
if (sniffer_result.return_types.size() == columns.size()) {
idx_t return_type_idx = 0;
for (auto &column : columns) {
if (column.type != sniffer_result.return_types[return_type_idx++]) {
min_sniff_match = false;
break;
}
}
} else {
min_sniff_match = false;
}
if (min_sniff_match) {
// If we got here, the types are right but the names are wrong, so let's fix the names.
idx_t sniff_name_idx = 0;
for (auto &column : columns) {
sniffer_result.names[sniff_name_idx++] = column.name;
}
return true;
}
}
// If we got to this point, the permissive single-row match did not succeed; fall through to the regular schema check below.
}
// Here we check if the schema of a given file matched our original schema
// We consider it's not a match if:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/common/types/value.hpp"

namespace duckdb {
Expand Down Expand Up @@ -88,15 +88,14 @@ void CSVSniffer::SetResultOptions() {
options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header;
}

SnifferResult CSVSniffer::MinimalSniff() {
AdaptiveSnifferResult CSVSniffer::MinimalSniff() {
if (set_columns.IsSet()) {
// Nothing to see here
return SnifferResult(*set_columns.types, *set_columns.names);
return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true);
}
// Return Types detected
vector<LogicalType> return_types;
// Column Names detected
vector<string> names;

buffer_manager->sniffing = true;
constexpr idx_t result_size = 2;
Expand All @@ -106,7 +105,8 @@ SnifferResult CSVSniffer::MinimalSniff() {
ColumnCountScanner count_scanner(buffer_manager, state_machine, error_handler, result_size);
auto &sniffed_column_counts = count_scanner.ParseChunk();
if (sniffed_column_counts.result_position == 0) {
return {{}, {}};
// The file is an empty file, we just return
return {{}, {}, false};
}

state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns;
Expand All @@ -130,20 +130,20 @@ SnifferResult CSVSniffer::MinimalSniff() {

// Possibly Gather Header
vector<HeaderValue> potential_header;
if (start_row != 0) {
for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);

for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);
}
names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

vector<string> names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
Expand All @@ -153,34 +153,33 @@ SnifferResult CSVSniffer::MinimalSniff() {
detected_types.push_back(d_type);
}

return {detected_types, names};
return {detected_types, names, sniffed_column_counts.result_position > 1};
}

SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) {
SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) {
auto min_sniff_res = MinimalSniff();
bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors();
// Check if we are happy with the result or if we need to do more sniffing
if (!error_handler->AnyErrors() && !detection_error_handler->AnyErrors()) {
// If we got no errors, we also run full if schemas do not match.
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
run_full =
!file_schema.SchemasMatch(error, min_sniff_res.names, min_sniff_res.return_types, options.file_path);
run_full = !file_schema.SchemasMatch(error, min_sniff_res, options.file_path, true);
}
}
if (run_full) {
// We run full sniffer
auto full_sniffer = SniffCSV();
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
if (!file_schema.SchemasMatch(error, full_sniffer.names, full_sniffer.return_types, options.file_path) &&
if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) &&
!options.ignore_errors.GetValue()) {
throw InvalidInputException(error);
}
}
return full_sniffer;
}
return min_sniff_res;
return min_sniff_res.ToSnifferResult();
}
SnifferResult CSVSniffer::SniffCSV(bool force_match) {
buffer_manager->sniffing = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/common/shared_ptr.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/main/client_data.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

#include "utf8proc.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "duckdb/common/operator/integer_cast_operator.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
struct TryCastFloatingOperator {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"

namespace duckdb {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
void CSVSniffer::ReplaceTypes() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/main/error_manager.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"
#include "duckdb/function/table/read_csv.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/scanner_boundary.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"
#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/copy_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "duckdb/common/types/column/column_data_collection.hpp"
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/function/copy_function.hpp"
#include "duckdb/function/scalar/string_functions.hpp"
#include "duckdb/function/table/read_csv.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/read_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "duckdb/common/union_by_name.hpp"
#include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_error.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
#include "duckdb/function/function_set.hpp"
#include "duckdb/main/client_context.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/sniff_csv.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "duckdb/function/built_in_functions.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
#include "duckdb/common/types/data_chunk.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp"
#include "duckdb/function/table_function.hpp"
#include "duckdb/main/client_context.hpp"
Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "2"
#define DUCKDB_PATCH_VERSION "3-dev11"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 1
Expand All @@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.1.2"
#define DUCKDB_VERSION "v1.1.3-dev11"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "f680b7d08f"
#define DUCKDB_SOURCE_ID "217ec4722e"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ class CSVBufferManager {

void UnpinBuffer(const idx_t cache_idx);
//! Returns the buffer size set for this CSV buffer manager
idx_t GetBufferSize();
idx_t GetBufferSize() const;
//! Returns the number of buffers in the cached_buffers cache
idx_t BufferCount();
idx_t BufferCount() const;
//! Returns whether this buffer manager is done. In the context of a buffer manager, this means that it has read
//! all buffers at least once.
bool Done();
bool Done() const;

void ResetBufferManager();
string GetFilePath();
string GetFilePath() const;

ClientContext &context;
idx_t skip_rows = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include "duckdb/common/types.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp"

namespace duckdb {
//! Basic CSV Column Info
Expand All @@ -23,8 +24,8 @@ struct CSVColumnInfo {
struct CSVSchema {
void Initialize(vector<string> &names, vector<LogicalType> &types, const string &file_path);
bool Empty() const;
bool SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path);
bool SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const;

private:
static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination);
Expand Down
Loading

0 comments on commit 5000d09

Please sign in to comment.