diff --git a/src/duckdb/extension/parquet/include/parquet_reader.hpp b/src/duckdb/extension/parquet/include/parquet_reader.hpp index ef8dcaf8c..6b536bbd5 100644 --- a/src/duckdb/extension/parquet/include/parquet_reader.hpp +++ b/src/duckdb/extension/parquet/include/parquet_reader.hpp @@ -93,6 +93,7 @@ struct ParquetOptions { MultiFileReaderOptions file_options; vector schema; + idx_t explicit_cardinality = 0; public: void Serialize(Serializer &serializer) const; diff --git a/src/duckdb/extension/parquet/parquet_extension.cpp b/src/duckdb/extension/parquet/parquet_extension.cpp index 2d76f82b6..617dc3cac 100644 --- a/src/duckdb/extension/parquet/parquet_extension.cpp +++ b/src/duckdb/extension/parquet/parquet_extension.cpp @@ -70,8 +70,8 @@ struct ParquetReadBindData : public TableFunctionData { // These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter idx_t initial_file_cardinality; idx_t initial_file_row_groups; + idx_t explicit_cardinality = 0; // can be set to inject exterior cardinality knowledge (e.g. from a data lake) ParquetOptions parquet_options; - MultiFileReaderBindData reader_bind; void Initialize(shared_ptr reader) { @@ -395,6 +395,7 @@ class ParquetScanFunction { table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN; table_function.named_parameters["debug_use_openssl"] = LogicalType::BOOLEAN; table_function.named_parameters["compression"] = LogicalType::VARCHAR; + table_function.named_parameters["explicit_cardinality"] = LogicalType::UBIGINT; table_function.named_parameters["schema"] = LogicalType::MAP(LogicalType::INTEGER, LogicalType::STRUCT({{{"name", LogicalType::VARCHAR}, {"type", LogicalType::VARCHAR}, @@ -545,7 +546,11 @@ class ParquetScanFunction { result->reader_bind = result->multi_file_reader->BindReader( context, result->types, result->names, *result->file_list, *result, parquet_options); } - + if (parquet_options.explicit_cardinality) { + auto file_count = result->file_list->GetTotalFileCount(); + result->explicit_cardinality = parquet_options.explicit_cardinality; + result->initial_file_cardinality = result->explicit_cardinality / (file_count ? file_count : 1); + } if (return_types.empty()) { // no expected types - just copy the types return_types = result->types; @@ -618,6 +623,8 @@ class ParquetScanFunction { // cannot be combined with hive_partitioning=true, so we disable auto-detection parquet_options.file_options.auto_detect_hive_partitioning = false; + } else if (loption == "explicit_cardinality") { + parquet_options.explicit_cardinality = UBigIntValue::Get(kv.second); } else if (loption == "encryption_config") { parquet_options.encryption_config = ParquetEncryptionConfig::Create(context, kv.second); } @@ -847,12 +854,13 @@ class ParquetScanFunction { static unique_ptr ParquetCardinality(ClientContext &context, const FunctionData *bind_data) { auto &data = bind_data->Cast(); - + if (data.explicit_cardinality) { + return make_uniq(data.explicit_cardinality); + } auto file_list_cardinality_estimate = data.file_list->GetCardinality(context); if (file_list_cardinality_estimate) { return file_list_cardinality_estimate; } - return make_uniq(MaxValue(data.initial_file_cardinality, (idx_t)1) * data.file_list->GetTotalFileCount()); } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 924400818..fdc69d562 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "2-dev186" +#define DUCKDB_PATCH_VERSION "2-dev192" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.2-dev186" +#define DUCKDB_VERSION "v1.1.2-dev192" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "92e0964376" +#define DUCKDB_SOURCE_ID "35dfcc06e6" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp"